scmkl.optimize_alpha
1import gc 2import numpy as np 3import anndata as ad 4from sklearn.model_selection import StratifiedKFold 5 6from scmkl.tfidf_normalize import tfidf_normalize 7from scmkl.calculate_z import calculate_z 8from scmkl.train_model import train_model 9from scmkl.multimodal_processing import multimodal_processing 10from scmkl.test import predict 11from scmkl.one_v_rest import get_class_train 12from scmkl.create_adata import sort_samples 13 14 15# Array of alphas to be used if not provided 16default_alphas = np.round(np.linspace(1.9, 0.05, 10),2) 17 18 19def stop_early(metric_array, alpha_idx, fold_idx): 20 """ 21 Assumes smallest alpha comes first. 22 """ 23 # Must be at least two metrics from two alphas to compare 24 if alpha_idx <= 0: 25 return False 26 27 cur_met = metric_array[alpha_idx, fold_idx] 28 last_met = metric_array[alpha_idx - 1, fold_idx] 29 30 if cur_met < last_met: 31 return True 32 else: 33 return False 34 35 36def sort_alphas(alpha_array: np.ndarray): 37 """ 38 Sorts alphas from smallest to largest. 39 """ 40 order = np.argsort(alpha_array) 41 alpha_array = alpha_array[order] 42 43 return alpha_array 44 45 46def get_folds(adata: ad.AnnData, k: int): 47 """ 48 With labels of samples for cross validation and number of folds, 49 returns the indices and label for each k-folds. 50 51 Parameters 52 ---------- 53 adata : ad.AnnData 54 `AnnData` object containing `'labels'` column in `.obs` and 55 `'train_indices'` in `.uns`. 56 57 k : int 58 The number of folds to perform cross validation over. 59 60 Returns 61 ------- 62 folds : dict 63 A dictionary with keys being [0, k) and values being a tuple 64 with first element being training sample indices and the second 65 being testing value indices. 66 67 Examples 68 -------- 69 >>> adata = scmkl.create_adata(...) 70 >>> folds = scmkl.get_folds(adata, k=4) 71 >>> 72 >>> train_fold_0, test_fold_0 = folds[0] 73 >>> 74 >>> train_fold_0 75 array([ 0, 3, 5, 6, 8, 9, 10, 11, 12, 13]) 76 >>> test_fold_0 77 array([ 1, 2, 4, 7]) 78 """ 79 y = adata.obs['labels'][adata.uns['train_indices']].copy() 80 81 # Creating dummy x prevents holding more views in memory 82 x = y.copy() 83 84 skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=100) 85 86 folds = dict() 87 for fold, (fold_train, fold_test) in enumerate(skf.split(x, y)): 88 folds[fold] = (fold_train, fold_test) 89 90 return folds 91 92 93def prepare_fold(fold_adata, sort_idcs, fold_train, fold_test): 94 """ 95 Reorders adata samples, reassigns each `adata.uns['train_indices' & 96 'test_indices']`, and removes sigmas if present. 97 98 Parameters 99 ---------- 100 fold_adata : list[ad.AnnData] 101 A `list` of `AnnData` objects to be reordered based on fold 102 indices. 103 104 sort_idcs: np.ndarray 105 The indices that will sort `AnnData`s as all train samples then 106 all test. 107 108 fold_train : np.ndarray 109 The indices of training sample for respective fold. 110 111 fold_test : np.ndarray 112 The indices of testing samples for respective fold. 113 114 Returns 115 ------- 116 fold_adata : list[ad.AnnData] 117 The `list` of `AnnData`s with each `AnnData` sorted as all 118 training samples then all testing samples. 119 `adata.uns['train_indices' & 'test_indices']` are also updated 120 to reflect new sample positions. 121 122 Examples 123 -------- 124 >>> adata = [scmkl.create_adata(...)] 125 >>> folds = scmkl.get_folds(adata[0], k=4) 126 >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 127 ... fold_test) 128 >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 129 ... fold_train, fold_test) 130 """ 131 for i in range(len(fold_adata)): 132 fold_adata[i] = fold_adata[i][sort_idcs] 133 fold_adata[i].uns['train_indices'] = fold_train 134 fold_adata[i].uns['test_indices'] = fold_test 135 136 # Need to recalculate sigmas for new fold train 137 for i in range(len(fold_adata)): 138 if 'sigma' in fold_adata[i].uns_keys(): 139 del fold_adata[i].uns['sigma'] 140 141 return fold_adata 142 143 144def process_fold(fold_adata, names, tfidf, combination, batches, batch_size): 145 """ 146 Combines adata if needed, estimates sigmas, and calculates kernels 147 for model training and evaluation. 148 149 Parameters 150 ---------- 151 fold_adata : list[ad.AnnData] 152 A `list` of `AnnData` objects to combine and calculate kernels 153 for. 154 155 names : list 156 A list of names respective to each `AnnData` in `fold_adata` 157 for verbose outputs. 158 159 tfidf : list[bool] 160 A boolean list indicating whether or not to perform TF-IDF 161 transformation for each adata respective to `fold_adata`. 162 163 combination : str 164 The method of combining `AnnData` objects passed to 165 `ad.concatenate()`. Ignored if `len(fold_adata) == 1`. 166 167 batches : int 168 The number of batches for kernel width (sigma) estimation. 169 170 batch_size : int 171 The number of samples to include in each batch of kernel width 172 (sigma) estimation. 173 174 Examples 175 -------- 176 >>> adata = [scmkl.create_adata(...)] 177 >>> tfidf = [False] 178 >>> names = ['adata1'] 179 >>> folds = scmkl.get_folds(adata[0], k=4) 180 >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 181 ... fold_test) 182 >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 183 ... fold_train, fold_test) 184 >>> fold_adata = scmkl.process_fold(fold_adata, names, tfidf) 185 """ 186 if 1 < len(fold_adata): 187 fold_adata = multimodal_processing(fold_adata, names, tfidf, 188 combination, batches, 189 batch_size, False) 190 else: 191 fold_adata = fold_adata[0] 192 if tfidf[0]: 193 fold_adata = tfidf_normalize(fold_adata, binarize=True) 194 fold_adata = calculate_z(fold_adata, n_features= 5000, 195 batches=batches, batch_size=batch_size) 196 197 return fold_adata 198 199 200def bin_optimize_alpha(adata: list[ad.AnnData], 201 group_size: int | None=None, 202 tfidf: list[bool]=False, 203 alpha_array: np.ndarray=default_alphas, 204 k: int=4, metric: str='AUROC', 205 early_stopping: bool=False, 206 batches: int=10, batch_size: int=100, 207 combination: str='concatenate'): 208 """ 209 Iteratively train a grouplasso model and update alpha to find the 210 parameter yielding best performing sparsity via k-fold cross 211 validation. This function currently only works for binary 212 experiments. Called by `scmkl.optimize_alpha()`. 213 214 Parameters 215 ---------- 216 adata : list[ad.AnnData] 217 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 218 `adata.uns.keys()`. 219 220 group_size : None | int 221 Argument describing how the features are grouped. If `None`, 222 `2 * adata.uns['D']` will be used. For more information see 223 [celer documentation](https://mathurinm.github.io/celer/ 224 generated/celer.GroupLasso.html). 225 226 tfidf : list | bool 227 If `False`, no data will be TF-IDF transformed. If 228 `type(adata) is list` and TF-IDF transformation is desired for 229 all or some of the data, a bool list corresponding to `adata` 230 must be provided. To simply TF-IDF transform `adata` when 231 `type(adata) is ad.AnnData`, use `True`. 232 233 alpha_array : np.ndarray 234 Array of all alpha values to be tested. 235 236 k : int 237 Number of folds to perform cross validation over. 238 239 metric : str 240 Which metric to use to optimize alpha. Options are `'AUROC'`, 241 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 242 243 batches : int 244 The number of batches to use for the distance calculation. 245 This will average the result of `batches` distance calculations 246 of `batch_size` randomly sampled cells. More batches will converge 247 to population distance values at the cost of scalability. 248 249 batch_size : int 250 The number of cells to include per batch for distance 251 calculations. Higher batch size will converge to population 252 distance values at the cost of scalability. If 253 `batches*batch_size > num_training_cells`, `batch_size` will be 254 reduced to `int(num_training_cells/batches)`. 255 256 Returns 257 ------- 258 alpha_star : float 259 The best performing alpha value from cross validation on 260 training data. 261 262 Examples 263 -------- 264 >>> alpha_star = scmkl.optimize_alpha(adata) 265 >>> alpha_star 266 0.1 267 """ 268 assert isinstance(k, int) and k > 0, "'k' must be positive" 269 270 import warnings 271 warnings.filterwarnings('ignore') 272 273 alpha_array = sort_alphas(alpha_array) 274 275 # Only want folds for training samples 276 train_indices = adata[0].uns['train_indices'].copy() 277 cv_adata = [adata[i][train_indices, :].copy() 278 for i in range(len(adata))] 279 280 folds = get_folds(adata[0], k) 281 282 metric_array = np.zeros((len(alpha_array), k)) 283 284 for fold in range(k): 285 286 fold_train, fold_test = folds[fold] 287 fold_adata = cv_adata.copy() 288 289 # Downstream functions expect train then test samples in adata(s) 290 sort_idcs, fold_train, fold_test = sort_samples(fold_train, fold_test) 291 fold_adata = prepare_fold(fold_adata, sort_idcs, 292 fold_train, fold_test) 293 294 names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))] 295 296 # Adatas need combined if applicable and kernels calculated 297 fold_adata = process_fold(fold_adata, names, tfidf, combination, 298 batches, batch_size) 299 300 for i, alpha in enumerate(alpha_array): 301 302 fold_adata = train_model(fold_adata, group_size, alpha=alpha) 303 _, metrics = predict(fold_adata, metrics=[metric]) 304 metric_array[i, fold] = metrics[metric] 305 306 # If metrics are decreasing, cv stopped and moving to next fold 307 end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold) 308 if end_fold and early_stopping: 309 break 310 311 del fold_adata 312 gc.collect() 313 314 del cv_adata 315 gc.collect() 316 317 # Need highest performing alpha for given metric 318 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 319 320 return alpha_star 321 322 323def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 324 tfidf: list[bool]=[False], 325 alpha_array: np.ndarray=default_alphas, k: int=4, 326 metric: str='AUROC', early_stopping: bool=False, 327 batches: int=10, batch_size: int=100, 328 force_balance: bool=True, combination: str='concatenate', 329 train_dict: dict=None): 330 """ 331 Wrapper function for running k-fold cross validation for every 332 label in a multiclass experiment. Called by 333 `scmkl.optimize_alpha()`. 334 335 Parameters 336 ---------- 337 adata : list[ad.AnnData] 338 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 339 `adata.uns.keys()`. 340 341 group_size : None | int 342 Argument describing how the features are grouped. If `None`, 343 `2 * adata.uns['D']` will be used. For more information see 344 [celer documentation](https://mathurinm.github.io/celer/ 345 generated/celer.GroupLasso.html). 346 347 tfidf : list[bool] 348 If `False`, no data will be TF-IDF transformed. If 349 `type(adata) is list` and TF-IDF transformation is desired for 350 all or some of the data, a bool list corresponding to `adata` 351 must be provided. To simply TF-IDF transform `adata` when 352 `type(adata) is ad.AnnData`, use `True`. 353 354 alpha_array : np.ndarray 355 Array of all alpha values to be tested. 356 357 k : int 358 Number of folds to perform cross validation over. 359 360 metric : str 361 Which metric to use to optimize alpha. Options are `'AUROC'`, 362 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 363 364 batches : int 365 The number of batches to use for the distance calculation. 366 This will average the result of `batches` distance calculations 367 of `batch_size` randomly sampled cells. More batches will converge 368 to population distance values at the cost of scalability. 369 370 batch_size : int 371 The number of cells to include per batch for distance 372 calculations. Higher batch size will converge to population 373 distance values at the cost of scalability. If 374 `batches*batch_size > num_training_cells`, `batch_size` will be 375 reduced to `int(num_training_cells/batches)`. 376 377 force_balance: bool 378 If `True`, training sets will be balanced to reduce class label 379 imbalance for each iteration. Defaults to `False`. 380 381 other_factor : float 382 The ratio of cells to sample for the other class for each 383 model. For example, if classifying B cells with 100 B cells in 384 training, if `other_factor=1`, 100 cells that are not B cells 385 will be trained on with the B cells. This will be done for each 386 fold for each class if `force_balance` is `True`. 387 388 combination: str 389 How should multiple views of data be combined. For more details 390 see ad.concat. 391 392 train_dict: dict 393 A `dict` where each key is a class label and values are are the 394 indices to be trained with for that class for class balance. 395 All values must be present in each adata.uns['train_indices']. 396 397 Returns 398 ------- 399 alpha_star : dict 400 A dictionary with keys being class labels and values being the 401 best performing alpha parameter for that class as a float. 402 """ 403 classes = np.unique(adata[0].obs['labels']) 404 orig_labels = adata[0].obs['labels'].to_numpy().copy() 405 orig_train = adata[0].uns['train_indices'].copy() 406 407 if train_dict: 408 train_idcs = train_dict 409 else: 410 if force_balance: 411 train_idcs = get_class_train(adata[0].uns['train_indices'], 412 adata[0].obs['labels'], 413 adata[0].uns['seed_obj']) 414 else: 415 train_idcs = {ct: adata[0].uns['train_indices'].copy() 416 for ct in classes} 417 418 opt_alpha_dict = dict() 419 420 for cl in classes: 421 temp_classes = orig_labels.copy() 422 temp_classes[temp_classes != cl] = 'other' 423 424 for i in range(len(adata)): 425 adata[i].obs['labels'] = temp_classes.copy() 426 adata[i].uns['train_indices'] = train_idcs[cl] 427 428 opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 429 alpha_array, k, metric, 430 early_stopping, batches, 431 batch_size, combination) 432 433 # Global adata obj will be permanently changed if not reset 434 for i in range(len(adata)): 435 adata[i].obs['labels'] = orig_labels 436 adata[i].uns['train_indices'] = orig_train 437 438 return opt_alpha_dict 439 440 441def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 442 group_size: int | None=None, tfidf: None | list[bool]=None, 443 alpha_array: np.ndarray=default_alphas, k: int=4, 444 metric: str='AUROC', early_stopping: bool=False, 445 batches: int=10, batch_size: int=100, 446 combination: str='concatenate', force_balance: bool=True, 447 train_dict: dict=None): 448 """ 449 K-fold cross validation for optimizing alpha hyperparameter using 450 training indices. 451 452 Parameters 453 ---------- 454 adata : list[ad.AnnData] 455 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 456 `adata.uns.keys()`. 457 458 group_size : None | int 459 Argument describing how the features are grouped. If `None`, 460 `2 * adata.uns['D']` will be used. For more information see 461 [celer documentation](https://mathurinm.github.io/celer/ 462 generated/celer.GroupLasso.html). 463 464 tfidf : list[bool] 465 If `False`, no data will be TF-IDF transformed. If 466 `type(adata) is list` and TF-IDF transformation is desired for 467 all or some of the data, a bool list corresponding to `adata` 468 must be provided. To simply TF-IDF transform `adata` when 469 `type(adata) is ad.AnnData`, use `True`. 470 471 alpha_array : np.ndarray 472 Array of all alpha values to be tested. 473 474 k : int 475 Number of folds to perform cross validation over. 476 477 metric : str 478 Which metric to use to optimize alpha. Options are `'AUROC'`, 479 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 480 481 batches : int 482 The number of batches to use for the distance calculation. 483 This will average the result of `batches` distance calculations 484 of `batch_size` randomly sampled cells. More batches will converge 485 to population distance values at the cost of scalability. 486 487 batch_size : int 488 The number of cells to include per batch for distance 489 calculations. Higher batch size will converge to population 490 distance values at the cost of scalability. If 491 `batches*batch_size > num_training_cells`, `batch_size` will be 492 reduced to `int(num_training_cells/batches)`. 493 494 force_balance: bool 495 If `True`, training sets will be balanced to reduce class label 496 imbalance for each iteration. Defaults to `False`. 497 498 other_factor : float 499 The ratio of cells to sample for the other class for each 500 model. For example, if classifying B cells with 100 B cells in 501 training, if `other_factor=1`, 100 cells that are not B cells 502 will be trained on with the B cells. This will be done for each 503 fold for each class if `force_balance` is `True`. 504 505 combination: str 506 How should multiple views of data be combined. For more details 507 see ad.concat. 508 509 train_dict: dict 510 A `dict` where each key is a class label and values are are the 511 indices to be trained with for that class for class balance. 512 All values must be present in each adata.uns['train_indices']. 513 514 Returns 515 ------- 516 alpha_star : float | dict 517 If number of classes is more than 2, a dictionary with keys 518 being class labels and values being the best performing alpha 519 parameter for that class as a float. Else, a float for 520 comparing the two classes. 521 """ 522 # Need singe-view runs to be iterable 523 if isinstance(adata, ad.AnnData): 524 adata = [adata.copy()] 525 526 if isinstance(tfidf, type(None)): 527 tfidf = len(adata)*[False] 528 529 is_multi = 2 < len(set(adata[0].obs['labels'])) 530 531 if isinstance(group_size, type(None)): 532 group_size = 2*adata[0].uns['D'] 533 534 535 if is_multi: 536 alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 537 alpha_array, k, metric, 538 early_stopping, batches, 539 batch_size, force_balance, 540 combination, train_dict) 541 542 else: 543 alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 544 alpha_array, k, metric, 545 early_stopping, batches, 546 batch_size, combination) 547 548 gc.collect() 549 550 return alpha_star
20def stop_early(metric_array, alpha_idx, fold_idx): 21 """ 22 Assumes smallest alpha comes first. 23 """ 24 # Must be at least two metrics from two alphas to compare 25 if alpha_idx <= 0: 26 return False 27 28 cur_met = metric_array[alpha_idx, fold_idx] 29 last_met = metric_array[alpha_idx - 1, fold_idx] 30 31 if cur_met < last_met: 32 return True 33 else: 34 return False
Assumes smallest alpha comes first.
37def sort_alphas(alpha_array: np.ndarray): 38 """ 39 Sorts alphas from smallest to largest. 40 """ 41 order = np.argsort(alpha_array) 42 alpha_array = alpha_array[order] 43 44 return alpha_array
Sorts alphas from smallest to largest.
47def get_folds(adata: ad.AnnData, k: int): 48 """ 49 With labels of samples for cross validation and number of folds, 50 returns the indices and label for each k-folds. 51 52 Parameters 53 ---------- 54 adata : ad.AnnData 55 `AnnData` object containing `'labels'` column in `.obs` and 56 `'train_indices'` in `.uns`. 57 58 k : int 59 The number of folds to perform cross validation over. 60 61 Returns 62 ------- 63 folds : dict 64 A dictionary with keys being [0, k) and values being a tuple 65 with first element being training sample indices and the second 66 being testing value indices. 67 68 Examples 69 -------- 70 >>> adata = scmkl.create_adata(...) 71 >>> folds = scmkl.get_folds(adata, k=4) 72 >>> 73 >>> train_fold_0, test_fold_0 = folds[0] 74 >>> 75 >>> train_fold_0 76 array([ 0, 3, 5, 6, 8, 9, 10, 11, 12, 13]) 77 >>> test_fold_0 78 array([ 1, 2, 4, 7]) 79 """ 80 y = adata.obs['labels'][adata.uns['train_indices']].copy() 81 82 # Creating dummy x prevents holding more views in memory 83 x = y.copy() 84 85 skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=100) 86 87 folds = dict() 88 for fold, (fold_train, fold_test) in enumerate(skf.split(x, y)): 89 folds[fold] = (fold_train, fold_test) 90 91 return folds
With labels of samples for cross validation and number of folds, returns the indices and label for each k-folds.
Parameters
- adata (ad.AnnData):
AnnDataobject containing'labels'column in.obsand'train_indices'in.uns. - k (int): The number of folds to perform cross validation over.
Returns
- folds (dict): A dictionary with keys being [0, k) and values being a tuple with first element being training sample indices and the second being testing value indices.
Examples
>>> adata = scmkl.create_adata(...)
>>> folds = scmkl.get_folds(adata, k=4)
>>>
>>> train_fold_0, test_fold_0 = folds[0]
>>>
>>> train_fold_0
array([ 0, 3, 5, 6, 8, 9, 10, 11, 12, 13])
>>> test_fold_0
array([ 1, 2, 4, 7])
94def prepare_fold(fold_adata, sort_idcs, fold_train, fold_test): 95 """ 96 Reorders adata samples, reassigns each `adata.uns['train_indices' & 97 'test_indices']`, and removes sigmas if present. 98 99 Parameters 100 ---------- 101 fold_adata : list[ad.AnnData] 102 A `list` of `AnnData` objects to be reordered based on fold 103 indices. 104 105 sort_idcs: np.ndarray 106 The indices that will sort `AnnData`s as all train samples then 107 all test. 108 109 fold_train : np.ndarray 110 The indices of training sample for respective fold. 111 112 fold_test : np.ndarray 113 The indices of testing samples for respective fold. 114 115 Returns 116 ------- 117 fold_adata : list[ad.AnnData] 118 The `list` of `AnnData`s with each `AnnData` sorted as all 119 training samples then all testing samples. 120 `adata.uns['train_indices' & 'test_indices']` are also updated 121 to reflect new sample positions. 122 123 Examples 124 -------- 125 >>> adata = [scmkl.create_adata(...)] 126 >>> folds = scmkl.get_folds(adata[0], k=4) 127 >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 128 ... fold_test) 129 >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 130 ... fold_train, fold_test) 131 """ 132 for i in range(len(fold_adata)): 133 fold_adata[i] = fold_adata[i][sort_idcs] 134 fold_adata[i].uns['train_indices'] = fold_train 135 fold_adata[i].uns['test_indices'] = fold_test 136 137 # Need to recalculate sigmas for new fold train 138 for i in range(len(fold_adata)): 139 if 'sigma' in fold_adata[i].uns_keys(): 140 del fold_adata[i].uns['sigma'] 141 142 return fold_adata
Reorders adata samples, reassigns each adata.uns['train_indices' &
'test_indices'], and removes sigmas if present.
Parameters
- fold_adata (list[ad.AnnData]):
A
listofAnnDataobjects to be reordered based on fold indices. - sort_idcs (np.ndarray):
The indices that will sort
AnnDatas as all train samples then all test. - fold_train (np.ndarray): The indices of training sample for respective fold.
- fold_test (np.ndarray): The indices of testing samples for respective fold.
Returns
- fold_adata (list[ad.AnnData]):
The
listofAnnDatas with eachAnnDatasorted as all training samples then all testing samples.adata.uns['train_indices' & 'test_indices']are also updated to reflect new sample positions.
Examples
>>> adata = [scmkl.create_adata(...)]
>>> folds = scmkl.get_folds(adata[0], k=4)
>>> sort_idcs, fold_train, fold_test = sort_samples(fold_train,
... fold_test)
>>> fold_adata = scmkl.prepare_fold(adata, sort_idcs,
... fold_train, fold_test)
145def process_fold(fold_adata, names, tfidf, combination, batches, batch_size): 146 """ 147 Combines adata if needed, estimates sigmas, and calculates kernels 148 for model training and evaluation. 149 150 Parameters 151 ---------- 152 fold_adata : list[ad.AnnData] 153 A `list` of `AnnData` objects to combine and calculate kernels 154 for. 155 156 names : list 157 A list of names respective to each `AnnData` in `fold_adata` 158 for verbose outputs. 159 160 tfidf : list[bool] 161 A boolean list indicating whether or not to perform TF-IDF 162 transformation for each adata respective to `fold_adata`. 163 164 combination : str 165 The method of combining `AnnData` objects passed to 166 `ad.concatenate()`. Ignored if `len(fold_adata) == 1`. 167 168 batches : int 169 The number of batches for kernel width (sigma) estimation. 170 171 batch_size : int 172 The number of samples to include in each batch of kernel width 173 (sigma) estimation. 174 175 Examples 176 -------- 177 >>> adata = [scmkl.create_adata(...)] 178 >>> tfidf = [False] 179 >>> names = ['adata1'] 180 >>> folds = scmkl.get_folds(adata[0], k=4) 181 >>> sort_idcs, fold_train, fold_test = sort_samples(fold_train, 182 ... fold_test) 183 >>> fold_adata = scmkl.prepare_fold(adata, sort_idcs, 184 ... fold_train, fold_test) 185 >>> fold_adata = scmkl.process_fold(fold_adata, names, tfidf) 186 """ 187 if 1 < len(fold_adata): 188 fold_adata = multimodal_processing(fold_adata, names, tfidf, 189 combination, batches, 190 batch_size, False) 191 else: 192 fold_adata = fold_adata[0] 193 if tfidf[0]: 194 fold_adata = tfidf_normalize(fold_adata, binarize=True) 195 fold_adata = calculate_z(fold_adata, n_features= 5000, 196 batches=batches, batch_size=batch_size) 197 198 return fold_adata
Combines adata if needed, estimates sigmas, and calculates kernels for model training and evaluation.
Parameters
- fold_adata (list[ad.AnnData]):
A
listofAnnDataobjects to combine and calculate kernels for. - names (list):
A list of names respective to each
AnnDatainfold_adatafor verbose outputs. - tfidf (list[bool]):
A boolean list indicating whether or not to perform TF-IDF
transformation for each adata respective to
fold_adata. - combination (str):
The method of combining
AnnDataobjects passed toad.concatenate(). Ignored iflen(fold_adata) == 1. - batches (int): The number of batches for kernel width (sigma) estimation.
- batch_size (int): The number of samples to include in each batch of kernel width (sigma) estimation.
Examples
>>> adata = [scmkl.create_adata(...)]
>>> tfidf = [False]
>>> names = ['adata1']
>>> folds = scmkl.get_folds(adata[0], k=4)
>>> sort_idcs, fold_train, fold_test = sort_samples(fold_train,
... fold_test)
>>> fold_adata = scmkl.prepare_fold(adata, sort_idcs,
... fold_train, fold_test)
>>> fold_adata = scmkl.process_fold(fold_adata, names, tfidf)
201def bin_optimize_alpha(adata: list[ad.AnnData], 202 group_size: int | None=None, 203 tfidf: list[bool]=False, 204 alpha_array: np.ndarray=default_alphas, 205 k: int=4, metric: str='AUROC', 206 early_stopping: bool=False, 207 batches: int=10, batch_size: int=100, 208 combination: str='concatenate'): 209 """ 210 Iteratively train a grouplasso model and update alpha to find the 211 parameter yielding best performing sparsity via k-fold cross 212 validation. This function currently only works for binary 213 experiments. Called by `scmkl.optimize_alpha()`. 214 215 Parameters 216 ---------- 217 adata : list[ad.AnnData] 218 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 219 `adata.uns.keys()`. 220 221 group_size : None | int 222 Argument describing how the features are grouped. If `None`, 223 `2 * adata.uns['D']` will be used. For more information see 224 [celer documentation](https://mathurinm.github.io/celer/ 225 generated/celer.GroupLasso.html). 226 227 tfidf : list | bool 228 If `False`, no data will be TF-IDF transformed. If 229 `type(adata) is list` and TF-IDF transformation is desired for 230 all or some of the data, a bool list corresponding to `adata` 231 must be provided. To simply TF-IDF transform `adata` when 232 `type(adata) is ad.AnnData`, use `True`. 233 234 alpha_array : np.ndarray 235 Array of all alpha values to be tested. 236 237 k : int 238 Number of folds to perform cross validation over. 239 240 metric : str 241 Which metric to use to optimize alpha. Options are `'AUROC'`, 242 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 243 244 batches : int 245 The number of batches to use for the distance calculation. 246 This will average the result of `batches` distance calculations 247 of `batch_size` randomly sampled cells. More batches will converge 248 to population distance values at the cost of scalability. 249 250 batch_size : int 251 The number of cells to include per batch for distance 252 calculations. Higher batch size will converge to population 253 distance values at the cost of scalability. If 254 `batches*batch_size > num_training_cells`, `batch_size` will be 255 reduced to `int(num_training_cells/batches)`. 256 257 Returns 258 ------- 259 alpha_star : float 260 The best performing alpha value from cross validation on 261 training data. 262 263 Examples 264 -------- 265 >>> alpha_star = scmkl.optimize_alpha(adata) 266 >>> alpha_star 267 0.1 268 """ 269 assert isinstance(k, int) and k > 0, "'k' must be positive" 270 271 import warnings 272 warnings.filterwarnings('ignore') 273 274 alpha_array = sort_alphas(alpha_array) 275 276 # Only want folds for training samples 277 train_indices = adata[0].uns['train_indices'].copy() 278 cv_adata = [adata[i][train_indices, :].copy() 279 for i in range(len(adata))] 280 281 folds = get_folds(adata[0], k) 282 283 metric_array = np.zeros((len(alpha_array), k)) 284 285 for fold in range(k): 286 287 fold_train, fold_test = folds[fold] 288 fold_adata = cv_adata.copy() 289 290 # Downstream functions expect train then test samples in adata(s) 291 sort_idcs, fold_train, fold_test = sort_samples(fold_train, fold_test) 292 fold_adata = prepare_fold(fold_adata, sort_idcs, 293 fold_train, fold_test) 294 295 names = ['Adata ' + str(i + 1) for i in range(len(cv_adata))] 296 297 # Adatas need combined if applicable and kernels calculated 298 fold_adata = process_fold(fold_adata, names, tfidf, combination, 299 batches, batch_size) 300 301 for i, alpha in enumerate(alpha_array): 302 303 fold_adata = train_model(fold_adata, group_size, alpha=alpha) 304 _, metrics = predict(fold_adata, metrics=[metric]) 305 metric_array[i, fold] = metrics[metric] 306 307 # If metrics are decreasing, cv stopped and moving to next fold 308 end_fold = stop_early(metric_array, alpha_idx=i, fold_idx=fold) 309 if end_fold and early_stopping: 310 break 311 312 del fold_adata 313 gc.collect() 314 315 del cv_adata 316 gc.collect() 317 318 # Need highest performing alpha for given metric 319 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 320 321 return alpha_star
Iteratively train a grouplasso model and update alpha to find the
parameter yielding best performing sparsity via k-fold cross
validation. This function currently only works for binary
experiments. Called by scmkl.optimize_alpha.
Parameters
- adata (list[ad.AnnData]):
List of
ad.AnnData(s) with'Z_train'and'Z_test'inadata.uns.keys(). - group_size (None | int):
Argument describing how the features are grouped. If
None,2 * adata.uns['D']will be used. For more information see celer documentation. - tfidf (list | bool):
If
False, no data will be TF-IDF transformed. Iftype(adata) is listand TF-IDF transformation is desired for all or some of the data, a bool list corresponding toadatamust be provided. To simply TF-IDF transformadatawhentype(adata) is ad.AnnData, useTrue. - alpha_array (np.ndarray): Array of all alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC','Accuracy','F1-Score','Precision', and'Recall'. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batchesdistance calculations ofbatch_sizerandomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells,batch_sizewill be reduced toint(num_training_cells/batches).
Returns
- alpha_star (float): The best performing alpha value from cross validation on training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1
324def multi_optimize_alpha(adata: list[ad.AnnData], group_size: int, 325 tfidf: list[bool]=[False], 326 alpha_array: np.ndarray=default_alphas, k: int=4, 327 metric: str='AUROC', early_stopping: bool=False, 328 batches: int=10, batch_size: int=100, 329 force_balance: bool=True, combination: str='concatenate', 330 train_dict: dict=None): 331 """ 332 Wrapper function for running k-fold cross validation for every 333 label in a multiclass experiment. Called by 334 `scmkl.optimize_alpha()`. 335 336 Parameters 337 ---------- 338 adata : list[ad.AnnData] 339 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 340 `adata.uns.keys()`. 341 342 group_size : None | int 343 Argument describing how the features are grouped. If `None`, 344 `2 * adata.uns['D']` will be used. For more information see 345 [celer documentation](https://mathurinm.github.io/celer/ 346 generated/celer.GroupLasso.html). 347 348 tfidf : list[bool] 349 If `False`, no data will be TF-IDF transformed. If 350 `type(adata) is list` and TF-IDF transformation is desired for 351 all or some of the data, a bool list corresponding to `adata` 352 must be provided. To simply TF-IDF transform `adata` when 353 `type(adata) is ad.AnnData`, use `True`. 354 355 alpha_array : np.ndarray 356 Array of all alpha values to be tested. 357 358 k : int 359 Number of folds to perform cross validation over. 360 361 metric : str 362 Which metric to use to optimize alpha. Options are `'AUROC'`, 363 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 364 365 batches : int 366 The number of batches to use for the distance calculation. 367 This will average the result of `batches` distance calculations 368 of `batch_size` randomly sampled cells. More batches will converge 369 to population distance values at the cost of scalability. 370 371 batch_size : int 372 The number of cells to include per batch for distance 373 calculations. Higher batch size will converge to population 374 distance values at the cost of scalability. If 375 `batches*batch_size > num_training_cells`, `batch_size` will be 376 reduced to `int(num_training_cells/batches)`. 377 378 force_balance: bool 379 If `True`, training sets will be balanced to reduce class label 380 imbalance for each iteration. Defaults to `False`. 381 382 other_factor : float 383 The ratio of cells to sample for the other class for each 384 model. For example, if classifying B cells with 100 B cells in 385 training, if `other_factor=1`, 100 cells that are not B cells 386 will be trained on with the B cells. This will be done for each 387 fold for each class if `force_balance` is `True`. 388 389 combination: str 390 How should multiple views of data be combined. For more details 391 see ad.concat. 392 393 train_dict: dict 394 A `dict` where each key is a class label and values are are the 395 indices to be trained with for that class for class balance. 396 All values must be present in each adata.uns['train_indices']. 397 398 Returns 399 ------- 400 alpha_star : dict 401 A dictionary with keys being class labels and values being the 402 best performing alpha parameter for that class as a float. 403 """ 404 classes = np.unique(adata[0].obs['labels']) 405 orig_labels = adata[0].obs['labels'].to_numpy().copy() 406 orig_train = adata[0].uns['train_indices'].copy() 407 408 if train_dict: 409 train_idcs = train_dict 410 else: 411 if force_balance: 412 train_idcs = get_class_train(adata[0].uns['train_indices'], 413 adata[0].obs['labels'], 414 adata[0].uns['seed_obj']) 415 else: 416 train_idcs = {ct: adata[0].uns['train_indices'].copy() 417 for ct in classes} 418 419 opt_alpha_dict = dict() 420 421 for cl in classes: 422 temp_classes = orig_labels.copy() 423 temp_classes[temp_classes != cl] = 'other' 424 425 for i in range(len(adata)): 426 adata[i].obs['labels'] = temp_classes.copy() 427 adata[i].uns['train_indices'] = train_idcs[cl] 428 429 opt_alpha_dict[cl] = bin_optimize_alpha(adata, group_size, tfidf, 430 alpha_array, k, metric, 431 early_stopping, batches, 432 batch_size, combination) 433 434 # Global adata obj will be permanently changed if not reset 435 for i in range(len(adata)): 436 adata[i].obs['labels'] = orig_labels 437 adata[i].uns['train_indices'] = orig_train 438 439 return opt_alpha_dict
Wrapper function for running k-fold cross validation for every
label in a multiclass experiment. Called by
scmkl.optimize_alpha.
Parameters
- adata (list[ad.AnnData]):
List of
ad.AnnData(s) with'Z_train'and'Z_test'inadata.uns.keys(). - group_size (None | int):
Argument describing how the features are grouped. If
None,2 * adata.uns['D']will be used. For more information see celer documentation. - tfidf (list[bool]):
If
False, no data will be TF-IDF transformed. Iftype(adata) is listand TF-IDF transformation is desired for all or some of the data, a bool list corresponding toadatamust be provided. To simply TF-IDF transformadatawhentype(adata) is ad.AnnData, useTrue. - alpha_array (np.ndarray): Array of all alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC','Accuracy','F1-Score','Precision', and'Recall'. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batchesdistance calculations ofbatch_sizerandomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells,batch_sizewill be reduced toint(num_training_cells/batches). - force_balance (bool):
If
True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults toFalse. - other_factor (float):
The ratio of cells to sample for the other class for each
model. For example, if classifying B cells with 100 B cells in
training, if
other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class ifforce_balanceisTrue. - combination (str): How should multiple views of data be combined. For more details see ad.concat.
- train_dict (dict):
A
dictwhere each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
- alpha_star (dict): A dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float.
442def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 443 group_size: int | None=None, tfidf: None | list[bool]=None, 444 alpha_array: np.ndarray=default_alphas, k: int=4, 445 metric: str='AUROC', early_stopping: bool=False, 446 batches: int=10, batch_size: int=100, 447 combination: str='concatenate', force_balance: bool=True, 448 train_dict: dict=None): 449 """ 450 K-fold cross validation for optimizing alpha hyperparameter using 451 training indices. 452 453 Parameters 454 ---------- 455 adata : list[ad.AnnData] 456 List of `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 457 `adata.uns.keys()`. 458 459 group_size : None | int 460 Argument describing how the features are grouped. If `None`, 461 `2 * adata.uns['D']` will be used. For more information see 462 [celer documentation](https://mathurinm.github.io/celer/ 463 generated/celer.GroupLasso.html). 464 465 tfidf : list[bool] 466 If `False`, no data will be TF-IDF transformed. If 467 `type(adata) is list` and TF-IDF transformation is desired for 468 all or some of the data, a bool list corresponding to `adata` 469 must be provided. To simply TF-IDF transform `adata` when 470 `type(adata) is ad.AnnData`, use `True`. 471 472 alpha_array : np.ndarray 473 Array of all alpha values to be tested. 474 475 k : int 476 Number of folds to perform cross validation over. 477 478 metric : str 479 Which metric to use to optimize alpha. Options are `'AUROC'`, 480 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 481 482 batches : int 483 The number of batches to use for the distance calculation. 484 This will average the result of `batches` distance calculations 485 of `batch_size` randomly sampled cells. More batches will converge 486 to population distance values at the cost of scalability. 487 488 batch_size : int 489 The number of cells to include per batch for distance 490 calculations. Higher batch size will converge to population 491 distance values at the cost of scalability. If 492 `batches*batch_size > num_training_cells`, `batch_size` will be 493 reduced to `int(num_training_cells/batches)`. 494 495 force_balance: bool 496 If `True`, training sets will be balanced to reduce class label 497 imbalance for each iteration. Defaults to `False`. 498 499 other_factor : float 500 The ratio of cells to sample for the other class for each 501 model. For example, if classifying B cells with 100 B cells in 502 training, if `other_factor=1`, 100 cells that are not B cells 503 will be trained on with the B cells. This will be done for each 504 fold for each class if `force_balance` is `True`. 505 506 combination: str 507 How should multiple views of data be combined. For more details 508 see ad.concat. 509 510 train_dict: dict 511 A `dict` where each key is a class label and values are are the 512 indices to be trained with for that class for class balance. 513 All values must be present in each adata.uns['train_indices']. 514 515 Returns 516 ------- 517 alpha_star : float | dict 518 If number of classes is more than 2, a dictionary with keys 519 being class labels and values being the best performing alpha 520 parameter for that class as a float. Else, a float for 521 comparing the two classes. 522 """ 523 # Need singe-view runs to be iterable 524 if isinstance(adata, ad.AnnData): 525 adata = [adata.copy()] 526 527 if isinstance(tfidf, type(None)): 528 tfidf = len(adata)*[False] 529 530 is_multi = 2 < len(set(adata[0].obs['labels'])) 531 532 if isinstance(group_size, type(None)): 533 group_size = 2*adata[0].uns['D'] 534 535 536 if is_multi: 537 alpha_star = multi_optimize_alpha(adata, group_size, tfidf, 538 alpha_array, k, metric, 539 early_stopping, batches, 540 batch_size, force_balance, 541 combination, train_dict) 542 543 else: 544 alpha_star = bin_optimize_alpha(adata, group_size, tfidf, 545 alpha_array, k, metric, 546 early_stopping, batches, 547 batch_size, combination) 548 549 gc.collect() 550 551 return alpha_star
K-fold cross validation for optimizing alpha hyperparameter using training indices.
Parameters
- adata (list[ad.AnnData]):
List of
ad.AnnData(s) with'Z_train'and'Z_test'inadata.uns.keys(). - group_size (None | int):
Argument describing how the features are grouped. If
None,2 * adata.uns['D']will be used. For more information see celer documentation. - tfidf (list[bool]):
If
False, no data will be TF-IDF transformed. Iftype(adata) is listand TF-IDF transformation is desired for all or some of the data, a bool list corresponding toadatamust be provided. To simply TF-IDF transformadatawhentype(adata) is ad.AnnData, useTrue. - alpha_array (np.ndarray): Array of all alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC','Accuracy','F1-Score','Precision', and'Recall'. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batchesdistance calculations ofbatch_sizerandomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells,batch_sizewill be reduced toint(num_training_cells/batches). - force_balance (bool):
If
True, training sets will be balanced to reduce class label imbalance for each iteration. Defaults toFalse. - other_factor (float):
The ratio of cells to sample for the other class for each
model. For example, if classifying B cells with 100 B cells in
training, if
other_factor=1, 100 cells that are not B cells will be trained on with the B cells. This will be done for each fold for each class ifforce_balanceisTrue. - combination (str): How should multiple views of data be combined. For more details see ad.concat.
- train_dict (dict):
A
dictwhere each key is a class label and values are are the indices to be trained with for that class for class balance. All values must be present in each adata.uns['train_indices'].
Returns
- alpha_star (float | dict): If number of classes is more than 2, a dictionary with keys being class labels and values being the best performing alpha parameter for that class as a float. Else, a float for comparing the two classes.