scmkl.optimize_alpha
```python
import numpy as np
import gc
import tracemalloc
import warnings
import sklearn.metrics

from scmkl.tfidf_normalize import tfidf_normalize
from scmkl.estimate_sigma import estimate_sigma
from scmkl.calculate_z import calculate_z
from scmkl.train_model import train_model
from scmkl.multimodal_processing import multimodal_processing


def _calculate_auroc(adata) -> float:
    '''
    Function to calculate the AUROC for a classification.
    Designed as a helper function. Recommended to use Predict() for
    model evaluation.

    Parameters
    ----------
    adata : AnnData object with a trained model and Z matrices in
            adata.uns

    Returns
    -------
    auc : Calculated AUROC value
    '''
    y_test = adata.obs['labels'].iloc[adata.uns['test_indices']].to_numpy()
    y_test = y_test.ravel()
    X_test = adata.uns['Z_test']

    assert X_test.shape[0] == len(y_test), (f"X has {X_test.shape[0]} "
                                            "samples and y has "
                                            f"{len(y_test)} samples.")

    # Sigmoid function to force decision values into [0, 1]
    probabilities = 1 / (1 + np.exp(-adata.uns['model'].predict(X_test)))

    # Group Lasso requires 'continuous' y values; re-discretize them here
    y = np.zeros(len(y_test))
    y[y_test == np.unique(y_test)[0]] = 1
    fpr, tpr, _ = sklearn.metrics.roc_curve(y, probabilities)
    auc = sklearn.metrics.auc(fpr, tpr)

    return auc


def _multimodal_optimize_alpha(adatas: list, group_size=1,
                               tfidf=[False, False],
                               alpha_array=np.linspace(1.9, 0.1, 10),
                               k=4):
    '''
    Iteratively train a group lasso model across a grid of alpha
    values and select the alpha yielding the highest mean AUROC across
    k cross-validation folds. This function is meant to find a good
    starting point for your model, and the alpha may need further
    fine tuning.

    Parameters
    ----------
    adatas : a list of AnnData objects where each object is one
             modality and Z_train and Z_test are calculated

    group_size : Argument describing how the features are grouped.
        From celer documentation:
            "groups : int | list of ints | list of lists of ints.
            Partition of features used in the penalty on w.
            If an int is passed, groups are contiguous blocks of
            features, of size groups.
            If a list of ints is passed, groups are assumed to be
            contiguous, group number g being of size groups[g].
            If a list of lists of ints is passed, groups[g]
            contains the feature indices of the group number g."
        If 1, model will behave identically to Lasso Regression.

    tfidf : a boolean mask where tfidf[i] corresponds to adatas[i].
            If True, TFIDF normalization will be applied to the
            respective adata during cross validation.

    alpha_array : Numpy array of all alpha values to be tested

    k : number of folds to perform cross validation over

    Returns
    -------
    alpha_star : The alpha value yielding the highest mean AUROC
                 across the k folds.
    '''

    assert isinstance(k, int) and k > 0, ("Must be a positive integer "
                                          "number of folds")

    if np.array_equal(alpha_array, np.linspace(1.9, 0.1, 10)):
        alpha_array = np.round(alpha_array, 2)

    warnings.filterwarnings('ignore')

    y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']]
    y = y.to_numpy()

    # Splits the labels evenly between folds
    pos_indices = np.where(y == np.unique(y)[0])[0]
    neg_indices = np.setdiff1d(np.arange(len(y)), pos_indices)

    pos_annotations = np.arange(len(pos_indices)) % k
    neg_annotations = np.arange(len(neg_indices)) % k

    auc_array = np.zeros((len(alpha_array), k))

    cv_adatas = []

    for adata in adatas:
        cv_adatas.append(adata[adata.uns['train_indices'], :].copy())

    del adatas
    gc.collect()

    for fold in np.arange(k):

        # Memory usage is only meaningful if tracemalloc tracing was started
        memory = [mem / 1e9 for mem in tracemalloc.get_traced_memory()]
        print(f'Fold {fold + 1}:\n Memory Usage: {memory} GB')

        # Capturing train indices for both classes
        ftrain_pos = pos_indices[np.where(pos_annotations != fold)[0]]
        ftrain_neg = neg_indices[np.where(neg_annotations != fold)[0]]

        # Capturing test indices for both classes
        ftest_pos = pos_indices[np.where(pos_annotations == fold)[0]]
        ftest_neg = neg_indices[np.where(neg_annotations == fold)[0]]

        # Creating class-balanced train/test split for fold
        fold_train = np.concatenate((ftrain_pos, ftrain_neg))
        fold_test = np.concatenate((ftest_pos, ftest_neg))

        for i in range(len(cv_adatas)):
            cv_adatas[i].uns['train_indices'] = fold_train
            cv_adatas[i].uns['test_indices'] = fold_test

        # Creating dummy names for cv.
        # Necessary for interpretability but not for AUROC cv
        dummy_names = [f'adata {i}' for i in range(len(cv_adatas))]

        # Calculate the Z's for each modality independently
        fold_cv_adata = multimodal_processing(adatas=cv_adatas,
                                              names=dummy_names,
                                              tfidf=tfidf)

        fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj']

        gc.collect()

        for j, alpha in enumerate(alpha_array):

            fold_cv_adata = train_model(fold_cv_adata, group_size,
                                        alpha=alpha)

            auc_array[j, fold] = _calculate_auroc(fold_cv_adata)

        del fold_cv_adata
        gc.collect()

    # Take AUROC mean across the k folds and
    # select the alpha resulting in the highest AUROC
    alpha_star = alpha_array[np.argmax(np.mean(auc_array, axis=1))]
    del cv_adatas
    gc.collect()

    return alpha_star


def optimize_alpha(adata, group_size=None, tfidf=False,
                   alpha_array=np.round(np.linspace(1.9, 0.1, 10), 2), k=4):
    '''
    Iteratively train a group lasso model over a grid of alpha values
    and select the alpha yielding the best cross-validated AUROC. This
    function currently only works for binary experiments.

    Parameters
    ----------
    **adata** : *AnnData* | *list[AnnData]*
        > `AnnData`(s) with `'Z_train'` and `'Z_test'` in
        `adata.uns.keys()`.

    **group_size** : *None* | *int*
        > Argument describing how the features are grouped. If *None*,
        `2 * adata.uns['D']` will be used.
        For more information see celer documentation.

    **tfidf** : *bool*
        > If `True`, TFIDF normalization will be run at each fold.

    **alpha_array** : *np.ndarray*
        > Array of all alpha values to be tested.

    **k** : *int*
        > Number of folds to perform cross validation over.

    Returns
    -------
    **alpha_star** : *float*
        > The best performing alpha value from cross validation on
        training data.

    Examples
    --------
    >>> alpha_star = scmkl.optimize_alpha(adata)
    >>> alpha_star
    0.1

    See Also
    --------
    celer.GroupLasso :
    https://mathurinm.github.io/celer/generated/celer.GroupLasso.html
    '''

    assert isinstance(k, int) and k > 0, ("Must be a positive integer number "
                                          "of folds")

    warnings.filterwarnings('ignore')

    if group_size is None:
        if isinstance(adata, list):
            group_size = adata[0].uns['D'] * 2
        else:
            group_size = adata.uns['D'] * 2

    if isinstance(adata, list):
        alpha_star = _multimodal_optimize_alpha(adatas=adata,
                                                group_size=group_size,
                                                tfidf=tfidf,
                                                alpha_array=alpha_array,
                                                k=k)
        return alpha_star

    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()

    # Splits the labels evenly between folds
    pos_indices = np.where(y == np.unique(y)[0])[0]
    neg_indices = np.setdiff1d(np.arange(len(y)), pos_indices)

    pos_annotations = np.arange(len(pos_indices)) % k
    neg_annotations = np.arange(len(neg_indices)) % k

    auc_array = np.zeros((len(alpha_array), k))

    gc.collect()

    for fold in np.arange(k):

        cv_adata = adata[adata.uns['train_indices'], :]

        # Create CV train/test indices
        # Capturing train indices for both classes
        ftrain_pos = pos_indices[np.where(pos_annotations != fold)[0]]
        ftrain_neg = neg_indices[np.where(neg_annotations != fold)[0]]

        # Capturing test indices for both classes
        ftest_pos = pos_indices[np.where(pos_annotations == fold)[0]]
        ftest_neg = neg_indices[np.where(neg_annotations == fold)[0]]

        # Creating class-balanced train/test split for fold
        fold_train = np.concatenate((ftrain_pos, ftrain_neg))
        fold_test = np.concatenate((ftest_pos, ftest_neg))

        cv_adata.uns['train_indices'] = fold_train
        cv_adata.uns['test_indices'] = fold_test

        if tfidf:
            cv_adata = tfidf_normalize(cv_adata, binarize=True)

        cv_adata = estimate_sigma(cv_adata, n_features=200)
        cv_adata = calculate_z(cv_adata, n_features=5000)

        gc.collect()

        for i, alpha in enumerate(alpha_array):

            cv_adata = train_model(cv_adata, group_size, alpha=alpha)
            auc_array[i, fold] = _calculate_auroc(cv_adata)

            gc.collect()

        del cv_adata
        gc.collect()

    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
    alpha_star = alpha_array[np.argmax(np.mean(auc_array, axis=1))]
    gc.collect()

    return alpha_star
```
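The helper `_calculate_auroc` maps the group lasso decision values into [0, 1] with a sigmoid and scores them against binarized labels using scikit-learn. A minimal, self-contained sketch of that scoring step, using made-up decision values and labels rather than scmkl data, looks like this:

```python
import numpy as np
import sklearn.metrics

# Hypothetical decision values from a trained model and string labels
decision_values = np.array([-2.1, -0.3, 0.4, 1.8, -1.0, 2.5])
y_test = np.array(['B', 'B', 'A', 'A', 'B', 'A'])

# Sigmoid maps unbounded decision values into [0, 1]
probabilities = 1 / (1 + np.exp(-decision_values))

# Re-discretize labels: the first unique label is treated as the positive class
y = np.zeros(len(y_test))
y[y_test == np.unique(y_test)[0]] = 1

fpr, tpr, _ = sklearn.metrics.roc_curve(y, probabilities)
print(sklearn.metrics.auc(fpr, tpr))  # 1.0 for this toy, perfectly separated case
```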
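Both optimizers build class-balanced folds by assigning each class's samples to folds round-robin with a modulo, then concatenating the per-class train and test indices. A numpy-only sketch of that assignment, with toy labels, is:

```python
import numpy as np

k = 4
y = np.array(['A'] * 10 + ['B'] * 6)   # toy training labels

# Indices of each class; the first unique label plays the 'positive' role
pos_indices = np.where(y == np.unique(y)[0])[0]
neg_indices = np.setdiff1d(np.arange(len(y)), pos_indices)

# Round-robin fold assignment keeps class proportions similar in every fold
pos_annotations = np.arange(len(pos_indices)) % k
neg_annotations = np.arange(len(neg_indices)) % k

fold = 0
fold_train = np.concatenate((pos_indices[pos_annotations != fold],
                             neg_indices[neg_annotations != fold]))
fold_test = np.concatenate((pos_indices[pos_annotations == fold],
                            neg_indices[neg_annotations == fold]))

print(fold_train, fold_test)
```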
Iteratively train a group lasso model over a grid of alpha values and select the alpha yielding the best cross-validated AUROC. This function currently only works for binary experiments.
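Concretely, the source above keeps a (n_alphas, k) matrix of per-fold AUROCs and returns the alpha with the highest mean across folds. A toy illustration of that selection step, with a made-up AUC matrix:

```python
import numpy as np

alpha_array = np.round(np.linspace(1.9, 0.1, 10), 2)   # default alpha grid
# Fabricated per-fold AUROCs for illustration only (10 alphas x 4 folds)
auc_array = np.random.default_rng(0).uniform(0.5, 1.0, (len(alpha_array), 4))

# Mean AUROC per alpha across folds, then pick the best alpha
alpha_star = alpha_array[np.argmax(np.mean(auc_array, axis=1))]
print(alpha_star)
```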
Parameters
adata : AnnData | list[AnnData]
AnnData object(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
group_size : None | int
Argument describing how the features are grouped. If None, `2 * adata.uns['D']` will be used; see the sketch after this parameter list. For more information see the celer documentation.
tfidf : bool
If True, TFIDF normalization will be run at each fold.
alpha_array : np.ndarray
Array of all alpha values to be tested.
k : int
Number of folds to perform cross validation over.
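As referenced in the group_size entry above, celer interprets an integer group_size as contiguous blocks of that many columns, one block per feature grouping. A small numpy sketch of how the columns of a Z matrix would partition, assuming an illustrative value for adata.uns['D']:

```python
import numpy as np

# Assumed values for illustration only
D = 100                      # stand-in for adata.uns['D']
group_size = 2 * D           # default used by optimize_alpha
n_groups = 5                 # e.g. five feature groupings

# With an integer group_size, celer treats the columns as contiguous
# blocks of group_size features, one block per group
column_blocks = [np.arange(g * group_size, (g + 1) * group_size)
                 for g in range(n_groups)]
print([block[[0, -1]] for block in column_blocks])  # first/last column per group
```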
Returns
alpha_star : float
The best-performing alpha value from cross-validation on the training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1
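The defaults can be overridden; for example, a coarser alpha grid and more folds could be passed as below (parameter values here are illustrative only, and no output is shown):

>>> import numpy as np
>>> alpha_grid = np.round(np.linspace(1.5, 0.1, 5), 2)
>>> alpha_star = scmkl.optimize_alpha(adata, alpha_array=alpha_grid, k=5)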
See Also
celer.GroupLasso : https://mathurinm.github.io/celer/generated/celer.GroupLasso.html