scmkl.optimize_alpha
1import numpy as np 2import anndata as ad 3import gc 4import tracemalloc 5 6from scmkl.tfidf_normalize import tfidf_normalize 7from scmkl.estimate_sigma import estimate_sigma 8from scmkl.calculate_z import calculate_z 9from scmkl.train_model import train_model 10from scmkl.multimodal_processing import multimodal_processing 11from scmkl.test import predict 12 13 14def multimodal_optimize_alpha(adatas: list[ad.AnnData], group_size: int, 15 tfidf_list: list | bool=False, 16 alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 17 k: int=4, metric: str='AUROC', 18 batches: int=10, batch_size: int=100): 19 """ 20 Iteratively train a grouplasso model and update alpha to find the 21 parameter yielding the desired sparsity. Meant to find a good 22 starting point for your model, and the alpha may need further fine 23 tuning. 24 25 Parameters 26 ---------- 27 adatas : list[ad.AnnData] 28 Objects of type `ad.AnnData` where each object is one modality 29 and Z_train and Z_test are calculated 30 31 group_size : None | int 32 Argument describing how the features are grouped. If `None`, 33 `2 * adata.uns['D']` will be used. For more information see 34 [celer documentation](https://mathurinm.github.io/celer/ 35 generated/celer.GroupLasso.html). 36 37 tfidf_list : list | None 38 A boolean mask where `tfidf_list[i]` is respective to 39 `adatas[i]`. If `True`, TF-IDF normalization will be applied to 40 the respective `ad.AnnData` during cross validation 41 42 alpha_array : np.ndarray 43 All alpha values to be tested. 44 45 k : int 46 Number of folds to perform cross validation over. 47 48 metric : str 49 Which metric to use to optimize alpha. Options are `'AUROC'`, 50 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 51 52 batches : int 53 The number of batches to use for the distance calculation. 54 This will average the result of `batches` distance calculations 55 of `batch_size` randomly sampled cells. More batches will converge 56 to population distance values at the cost of scalability. 57 58 batch_size : int 59 The number of cells to include per batch for distance 60 calculations. Higher batch size will converge to population 61 distance values at the cost of scalability. If 62 `batches*batch_size > num_training_cells`, `batch_size` will be 63 reduced to `int(num_training_cells/batches)`. 64 65 Returns 66 ------- 67 alpha_star : float 68 The alpha value yielding the best performing model from cross 69 validation. 70 """ 71 assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds' 72 73 import warnings 74 warnings.filterwarnings('ignore') 75 76 if not tfidf_list: 77 tfidf_list = [False]*len(adatas) 78 79 y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']].to_numpy() 80 81 # Splits the labels evenly between folds 82 positive_indices = np.where(y == np.unique(y)[0])[0] 83 negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices) 84 85 positive_annotations = np.arange(len(positive_indices)) % k 86 negative_annotations = np.arange(len(negative_indices)) % k 87 88 metric_array = np.zeros((len(alpha_array), k)) 89 90 cv_adatas = [] 91 92 for adata in adatas: 93 cv_adatas.append(adata[adata.uns['train_indices'],:].copy()) 94 95 del adatas 96 gc.collect() 97 98 for fold in np.arange(k): 99 100 fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 101 negative_indices[np.where(negative_annotations != fold)[0]])) 102 fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 103 negative_indices[np.where(negative_annotations == fold)[0]])) 104 105 for i in range(len(cv_adatas)): 106 cv_adatas[i].uns['train_indices'] = fold_train 107 cv_adatas[i].uns['test_indices'] = fold_test 108 109 # Creating dummy names for cv. 110 # Necessary for interpretability but not for AUROC cv 111 dummy_names = [f'adata {i}' for i in range(len(cv_adatas))] 112 113 # Calculate the Z's for each modality independently 114 fold_cv_adata = multimodal_processing(adatas = cv_adatas, 115 names = dummy_names, 116 tfidf = tfidf_list, 117 batch_size= batch_size, 118 batches = batches) 119 120 fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj'] 121 122 if 'sigma' in fold_cv_adata.uns_keys(): 123 del fold_cv_adata.uns['sigma'] 124 125 # In train_model we index Z_train for balancing multiclass labels. We just recreate 126 # dummy indices here that are unused for use in the binary case 127 fold_cv_adata.uns['train_indices'] = np.arange(0, len(fold_train)) 128 129 gc.collect() 130 131 for j, alpha in enumerate(alpha_array): 132 133 fold_cv_adata = train_model(fold_cv_adata, group_size, alpha = alpha) 134 135 _, metrics = predict(fold_cv_adata, metrics = [metric]) 136 metric_array[j, fold] = metrics[metric] 137 138 del fold_cv_adata 139 gc.collect() 140 141 # Take AUROC mean across the k folds and select the alpha resulting in highest AUROC 142 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 143 del cv_adatas 144 gc.collect() 145 146 return alpha_star 147 148 149def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 150 group_size: int | None=None, 151 tfidf: bool | list[bool]=False, 152 alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 153 k: int=4, metric: str='AUROC', 154 batches: int=10, batch_size: int=100): 155 """ 156 Iteratively train a grouplasso model and update alpha to find the 157 parameter yielding best performing sparsity. This function 158 currently only works for binary experiments. 159 160 Parameters 161 ---------- 162 adata : ad.AnnData | list[ad.AnnData] 163 `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 164 `adata.uns.keys()`. 165 166 group_size : None | int 167 Argument describing how the features are grouped. If `None`, 168 `2 * adata.uns['D']` will be used. For more information see 169 [celer documentation](https://mathurinm.github.io/celer/ 170 generated/celer.GroupLasso.html). 171 172 tfidf : list | bool 173 If `False`, no data will be TF-IDF transformed. If 174 `type(adata) is list` and TF-IDF transformation is desired for 175 all or some of the data, a bool list corresponding to `adata` 176 must be provided. To simply TF-IDF transform `adata` when 177 `type(adata) is ad.AnnData`, use `True`. 178 179 alpha_array : np.ndarray 180 Array of all alpha values to be tested. 181 182 k : int 183 Number of folds to perform cross validation over. 184 185 metric : str 186 Which metric to use to optimize alpha. Options are `'AUROC'`, 187 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 188 189 batches : int 190 The number of batches to use for the distance calculation. 191 This will average the result of `batches` distance calculations 192 of `batch_size` randomly sampled cells. More batches will converge 193 to population distance values at the cost of scalability. 194 195 batch_size : int 196 The number of cells to include per batch for distance 197 calculations. Higher batch size will converge to population 198 distance values at the cost of scalability. If 199 `batches*batch_size > num_training_cells`, `batch_size` will be 200 reduced to `int(num_training_cells/batches)`. 201 202 Returns 203 ------- 204 alpha_star : float 205 The best performing alpha value from cross validation on 206 training data. 207 208 Examples 209 -------- 210 >>> alpha_star = scmkl.optimize_alpha(adata) 211 >>> alpha_star 212 0.1 213 """ 214 assert isinstance(k, int) and k > 0, "'k' must be positive" 215 216 import warnings 217 warnings.filterwarnings('ignore') 218 219 if group_size == None: 220 group_size = adata.uns['D']*2 221 222 if type(adata) == list: 223 alpha_star = multimodal_optimize_alpha(adatas = adata, 224 group_size = group_size, 225 tfidf_list = tfidf, 226 alpha_array = alpha_array, 227 metric = metric, 228 batch_size = batch_size, 229 batches = batches) 230 return alpha_star 231 232 y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy() 233 234 # Splits the labels evenly between folds 235 positive_indices = np.where(y == np.unique(y)[0])[0] 236 negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices) 237 238 positive_annotations = np.arange(len(positive_indices)) % k 239 negative_annotations = np.arange(len(negative_indices)) % k 240 241 metric_array = np.zeros((len(alpha_array), k)) 242 243 gc.collect() 244 245 for fold in np.arange(k): 246 247 cv_adata = adata[adata.uns['train_indices'],:] 248 249 if 'sigma' in cv_adata.uns_keys(): 250 del cv_adata.uns['sigma'] 251 252 # Create CV train/test indices 253 fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 254 negative_indices[np.where(negative_annotations != fold)[0]])) 255 fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 256 negative_indices[np.where(negative_annotations == fold)[0]])) 257 258 cv_adata.uns['train_indices'] = fold_train 259 cv_adata.uns['test_indices'] = fold_test 260 261 if tfidf: 262 cv_adata = tfidf_normalize(cv_adata, binarize= True) 263 264 # Estimating kernel widths and calculating Zs 265 cv_adata = calculate_z(cv_adata, n_features= 5000, 266 batches = batches, batch_size = batch_size) 267 268 # In train_model we index Z_train for balancing multiclass labels. We just recreate 269 # dummy indices here that are unused for use in the binary case 270 cv_adata.uns['train_indices'] = np.arange(0, len(fold_train)) 271 272 gc.collect() 273 274 for i, alpha in enumerate(alpha_array): 275 276 cv_adata = train_model(cv_adata, group_size, alpha = alpha) 277 _, metrics = predict(cv_adata, metrics = [metric]) 278 metric_array[i, fold] = metrics[metric] 279 280 gc.collect() 281 282 del cv_adata 283 gc.collect() 284 285 # Take AUROC mean across the k folds to find alpha yielding highest AUROC 286 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 287 gc.collect() 288 289 290 return alpha_star
def
multimodal_optimize_alpha( adatas: list[anndata._core.anndata.AnnData], group_size: int, tfidf_list: list | bool = False, alpha_array: numpy.ndarray = array([1.9, 1.7, 1.5, 1.3, 1.1, 0.9, 0.7, 0.5, 0.3, 0.1]), k: int = 4, metric: str = 'AUROC', batches: int = 10, batch_size: int = 100):
15def multimodal_optimize_alpha(adatas: list[ad.AnnData], group_size: int, 16 tfidf_list: list | bool=False, 17 alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 18 k: int=4, metric: str='AUROC', 19 batches: int=10, batch_size: int=100): 20 """ 21 Iteratively train a grouplasso model and update alpha to find the 22 parameter yielding the desired sparsity. Meant to find a good 23 starting point for your model, and the alpha may need further fine 24 tuning. 25 26 Parameters 27 ---------- 28 adatas : list[ad.AnnData] 29 Objects of type `ad.AnnData` where each object is one modality 30 and Z_train and Z_test are calculated 31 32 group_size : None | int 33 Argument describing how the features are grouped. If `None`, 34 `2 * adata.uns['D']` will be used. For more information see 35 [celer documentation](https://mathurinm.github.io/celer/ 36 generated/celer.GroupLasso.html). 37 38 tfidf_list : list | None 39 A boolean mask where `tfidf_list[i]` is respective to 40 `adatas[i]`. If `True`, TF-IDF normalization will be applied to 41 the respective `ad.AnnData` during cross validation 42 43 alpha_array : np.ndarray 44 All alpha values to be tested. 45 46 k : int 47 Number of folds to perform cross validation over. 48 49 metric : str 50 Which metric to use to optimize alpha. Options are `'AUROC'`, 51 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 52 53 batches : int 54 The number of batches to use for the distance calculation. 55 This will average the result of `batches` distance calculations 56 of `batch_size` randomly sampled cells. More batches will converge 57 to population distance values at the cost of scalability. 58 59 batch_size : int 60 The number of cells to include per batch for distance 61 calculations. Higher batch size will converge to population 62 distance values at the cost of scalability. If 63 `batches*batch_size > num_training_cells`, `batch_size` will be 64 reduced to `int(num_training_cells/batches)`. 65 66 Returns 67 ------- 68 alpha_star : float 69 The alpha value yielding the best performing model from cross 70 validation. 71 """ 72 assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds' 73 74 import warnings 75 warnings.filterwarnings('ignore') 76 77 if not tfidf_list: 78 tfidf_list = [False]*len(adatas) 79 80 y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']].to_numpy() 81 82 # Splits the labels evenly between folds 83 positive_indices = np.where(y == np.unique(y)[0])[0] 84 negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices) 85 86 positive_annotations = np.arange(len(positive_indices)) % k 87 negative_annotations = np.arange(len(negative_indices)) % k 88 89 metric_array = np.zeros((len(alpha_array), k)) 90 91 cv_adatas = [] 92 93 for adata in adatas: 94 cv_adatas.append(adata[adata.uns['train_indices'],:].copy()) 95 96 del adatas 97 gc.collect() 98 99 for fold in np.arange(k): 100 101 fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 102 negative_indices[np.where(negative_annotations != fold)[0]])) 103 fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 104 negative_indices[np.where(negative_annotations == fold)[0]])) 105 106 for i in range(len(cv_adatas)): 107 cv_adatas[i].uns['train_indices'] = fold_train 108 cv_adatas[i].uns['test_indices'] = fold_test 109 110 # Creating dummy names for cv. 111 # Necessary for interpretability but not for AUROC cv 112 dummy_names = [f'adata {i}' for i in range(len(cv_adatas))] 113 114 # Calculate the Z's for each modality independently 115 fold_cv_adata = multimodal_processing(adatas = cv_adatas, 116 names = dummy_names, 117 tfidf = tfidf_list, 118 batch_size= batch_size, 119 batches = batches) 120 121 fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj'] 122 123 if 'sigma' in fold_cv_adata.uns_keys(): 124 del fold_cv_adata.uns['sigma'] 125 126 # In train_model we index Z_train for balancing multiclass labels. We just recreate 127 # dummy indices here that are unused for use in the binary case 128 fold_cv_adata.uns['train_indices'] = np.arange(0, len(fold_train)) 129 130 gc.collect() 131 132 for j, alpha in enumerate(alpha_array): 133 134 fold_cv_adata = train_model(fold_cv_adata, group_size, alpha = alpha) 135 136 _, metrics = predict(fold_cv_adata, metrics = [metric]) 137 metric_array[j, fold] = metrics[metric] 138 139 del fold_cv_adata 140 gc.collect() 141 142 # Take AUROC mean across the k folds and select the alpha resulting in highest AUROC 143 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 144 del cv_adatas 145 gc.collect() 146 147 return alpha_star
Iteratively train a grouplasso model and update alpha to find the parameter yielding the desired sparsity. Meant to find a good starting point for your model, and the alpha may need further fine tuning.
Parameters
- adatas (list[ad.AnnData]):
Objects of type
ad.AnnData
where each object is one modality and Z_train and Z_test are calculated - group_size (None | int):
Argument describing how the features are grouped. If
None
,2 * adata.uns['D']
will be used. For more information see celer documentation. - tfidf_list (list | None):
A boolean mask where
tfidf_list[i]
is respective toadatas[i]
. IfTrue
, TF-IDF normalization will be applied to the respectivead.AnnData
during cross validation - alpha_array (np.ndarray): All alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC'
,'Accuracy'
,'F1-Score'
,'Precision'
, and'Recall'
. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batches
distance calculations ofbatch_size
randomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells
,batch_size
will be reduced toint(num_training_cells/batches)
.
Returns
- alpha_star (float): The alpha value yielding the best performing model from cross validation.
def
optimize_alpha( adata: anndata._core.anndata.AnnData | list[anndata._core.anndata.AnnData], group_size: int | None = None, tfidf: bool | list[bool] = False, alpha_array: numpy.ndarray = array([1.9, 1.7, 1.5, 1.3, 1.1, 0.9, 0.7, 0.5, 0.3, 0.1]), k: int = 4, metric: str = 'AUROC', batches: int = 10, batch_size: int = 100):
150def optimize_alpha(adata: ad.AnnData | list[ad.AnnData], 151 group_size: int | None=None, 152 tfidf: bool | list[bool]=False, 153 alpha_array: np.ndarray=np.round(np.linspace(1.9,0.1, 10),2), 154 k: int=4, metric: str='AUROC', 155 batches: int=10, batch_size: int=100): 156 """ 157 Iteratively train a grouplasso model and update alpha to find the 158 parameter yielding best performing sparsity. This function 159 currently only works for binary experiments. 160 161 Parameters 162 ---------- 163 adata : ad.AnnData | list[ad.AnnData] 164 `ad.AnnData`(s) with `'Z_train'` and `'Z_test'` in 165 `adata.uns.keys()`. 166 167 group_size : None | int 168 Argument describing how the features are grouped. If `None`, 169 `2 * adata.uns['D']` will be used. For more information see 170 [celer documentation](https://mathurinm.github.io/celer/ 171 generated/celer.GroupLasso.html). 172 173 tfidf : list | bool 174 If `False`, no data will be TF-IDF transformed. If 175 `type(adata) is list` and TF-IDF transformation is desired for 176 all or some of the data, a bool list corresponding to `adata` 177 must be provided. To simply TF-IDF transform `adata` when 178 `type(adata) is ad.AnnData`, use `True`. 179 180 alpha_array : np.ndarray 181 Array of all alpha values to be tested. 182 183 k : int 184 Number of folds to perform cross validation over. 185 186 metric : str 187 Which metric to use to optimize alpha. Options are `'AUROC'`, 188 `'Accuracy'`, `'F1-Score'`, `'Precision'`, and `'Recall'`. 189 190 batches : int 191 The number of batches to use for the distance calculation. 192 This will average the result of `batches` distance calculations 193 of `batch_size` randomly sampled cells. More batches will converge 194 to population distance values at the cost of scalability. 195 196 batch_size : int 197 The number of cells to include per batch for distance 198 calculations. Higher batch size will converge to population 199 distance values at the cost of scalability. If 200 `batches*batch_size > num_training_cells`, `batch_size` will be 201 reduced to `int(num_training_cells/batches)`. 202 203 Returns 204 ------- 205 alpha_star : float 206 The best performing alpha value from cross validation on 207 training data. 208 209 Examples 210 -------- 211 >>> alpha_star = scmkl.optimize_alpha(adata) 212 >>> alpha_star 213 0.1 214 """ 215 assert isinstance(k, int) and k > 0, "'k' must be positive" 216 217 import warnings 218 warnings.filterwarnings('ignore') 219 220 if group_size == None: 221 group_size = adata.uns['D']*2 222 223 if type(adata) == list: 224 alpha_star = multimodal_optimize_alpha(adatas = adata, 225 group_size = group_size, 226 tfidf_list = tfidf, 227 alpha_array = alpha_array, 228 metric = metric, 229 batch_size = batch_size, 230 batches = batches) 231 return alpha_star 232 233 y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy() 234 235 # Splits the labels evenly between folds 236 positive_indices = np.where(y == np.unique(y)[0])[0] 237 negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices) 238 239 positive_annotations = np.arange(len(positive_indices)) % k 240 negative_annotations = np.arange(len(negative_indices)) % k 241 242 metric_array = np.zeros((len(alpha_array), k)) 243 244 gc.collect() 245 246 for fold in np.arange(k): 247 248 cv_adata = adata[adata.uns['train_indices'],:] 249 250 if 'sigma' in cv_adata.uns_keys(): 251 del cv_adata.uns['sigma'] 252 253 # Create CV train/test indices 254 fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 255 negative_indices[np.where(negative_annotations != fold)[0]])) 256 fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 257 negative_indices[np.where(negative_annotations == fold)[0]])) 258 259 cv_adata.uns['train_indices'] = fold_train 260 cv_adata.uns['test_indices'] = fold_test 261 262 if tfidf: 263 cv_adata = tfidf_normalize(cv_adata, binarize= True) 264 265 # Estimating kernel widths and calculating Zs 266 cv_adata = calculate_z(cv_adata, n_features= 5000, 267 batches = batches, batch_size = batch_size) 268 269 # In train_model we index Z_train for balancing multiclass labels. We just recreate 270 # dummy indices here that are unused for use in the binary case 271 cv_adata.uns['train_indices'] = np.arange(0, len(fold_train)) 272 273 gc.collect() 274 275 for i, alpha in enumerate(alpha_array): 276 277 cv_adata = train_model(cv_adata, group_size, alpha = alpha) 278 _, metrics = predict(cv_adata, metrics = [metric]) 279 metric_array[i, fold] = metrics[metric] 280 281 gc.collect() 282 283 del cv_adata 284 gc.collect() 285 286 # Take AUROC mean across the k folds to find alpha yielding highest AUROC 287 alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))] 288 gc.collect() 289 290 291 return alpha_star
Iteratively train a grouplasso model and update alpha to find the parameter yielding best performing sparsity. This function currently only works for binary experiments.
Parameters
- adata (ad.AnnData | list[ad.AnnData]):
ad.AnnData
(s) with'Z_train'
and'Z_test'
inadata.uns.keys()
. - group_size (None | int):
Argument describing how the features are grouped. If
None
,2 * adata.uns['D']
will be used. For more information see celer documentation. - tfidf (list | bool):
If
False
, no data will be TF-IDF transformed. Iftype(adata) is list
and TF-IDF transformation is desired for all or some of the data, a bool list corresponding toadata
must be provided. To simply TF-IDF transformadata
whentype(adata) is ad.AnnData
, useTrue
. - alpha_array (np.ndarray): Array of all alpha values to be tested.
- k (int): Number of folds to perform cross validation over.
- metric (str):
Which metric to use to optimize alpha. Options are
'AUROC'
,'Accuracy'
,'F1-Score'
,'Precision'
, and'Recall'
. - batches (int):
The number of batches to use for the distance calculation.
This will average the result of
batches
distance calculations ofbatch_size
randomly sampled cells. More batches will converge to population distance values at the cost of scalability. - batch_size (int):
The number of cells to include per batch for distance
calculations. Higher batch size will converge to population
distance values at the cost of scalability. If
batches*batch_size > num_training_cells
,batch_size
will be reduced toint(num_training_cells/batches)
.
Returns
- alpha_star (float): The best performing alpha value from cross validation on training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1