scmkl.optimize_alpha
import numpy as np
import gc
import tracemalloc

from scmkl.tfidf_normalize import tfidf_normalize
from scmkl.estimate_sigma import estimate_sigma
from scmkl.calculate_z import calculate_z
from scmkl.train_model import train_model
from scmkl.multimodal_processing import multimodal_processing
from scmkl.test import predict


def _multimodal_optimize_alpha(adatas : list, group_size = 1, tfidf = [False, False],
                               alpha_array = np.round(np.linspace(1.9, 0.1, 10), 2),
                               k = 4, metric = 'AUROC'):
    '''
    Iteratively train a grouplasso model over `alpha_array` to find the
    alpha yielding the best cross-validated performance on multimodal
    data. Meant to find a good starting point; alpha may need further
    fine tuning.

    Parameters
    ----------
    adatas : list[AnnData]
        One AnnData per modality, each with `'train_indices'`,
        `'labels'`, and `'seed_obj'` available.
    group_size : int | list[int] | list[list[int]]
        How features are grouped (see celer.GroupLasso `groups`). If 1,
        the model behaves identically to lasso regression.
    tfidf : list[bool]
        Mask respective to each entry of `adatas`; where True, TFIDF
        normalization is applied to that modality during CV.
    alpha_array : np.ndarray
        All alpha values to be tested.
    k : int
        Number of folds to perform cross validation over.
    metric : str
        Metric used to pick alpha (must be returned by `predict`).

    Returns
    -------
    alpha_star : float
        The alpha value yielding the highest mean metric across folds.
    '''
    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'

    import warnings
    warnings.filterwarnings('ignore')

    y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']].to_numpy()

    # Split the labels evenly between folds (round-robin per class so
    # class balance is preserved in every fold)
    positive_indices = np.where(y == np.unique(y)[0])[0]
    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)

    positive_annotations = np.arange(len(positive_indices)) % k
    negative_annotations = np.arange(len(negative_indices)) % k

    metric_array = np.zeros((len(alpha_array), k))

    # Work on training-only copies so fold indices never touch the originals
    cv_adatas = []
    for adata in adatas:
        cv_adatas.append(adata[adata.uns['train_indices'], :].copy())

    del adatas
    gc.collect()

    for fold in np.arange(k):

        # NOTE(review): reports (0, 0) unless tracemalloc.start() was
        # called upstream — confirm callers start tracing
        print(f'Fold {fold + 1}:\n Memory Usage: '
              f'{[mem / 1e9 for mem in tracemalloc.get_traced_memory()]} GB')

        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]],
                                     negative_indices[np.where(negative_annotations != fold)[0]]))
        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]],
                                    negative_indices[np.where(negative_annotations == fold)[0]]))

        for i in range(len(cv_adatas)):
            cv_adatas[i].uns['train_indices'] = fold_train
            cv_adatas[i].uns['test_indices'] = fold_test

        # Dummy names for CV: necessary for interpretability but not
        # for metric-based cross validation
        dummy_names = [f'adata {i}' for i in range(len(cv_adatas))]

        # Calculate the Z's for each modality independently
        fold_cv_adata = multimodal_processing(adatas = cv_adatas, names = dummy_names, tfidf = tfidf)
        fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj']

        gc.collect()

        for j, alpha in enumerate(alpha_array):
            fold_cv_adata = train_model(fold_cv_adata, group_size, alpha = alpha)
            _, metrics = predict(fold_cv_adata, metrics = [metric])
            metric_array[j, fold] = metrics[metric]

        del fold_cv_adata
        gc.collect()

    # Mean metric across the k folds; choose the alpha with the highest mean
    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
    del cv_adatas
    gc.collect()

    return alpha_star


def optimize_alpha(adata, group_size = None, tfidf = False,
                   alpha_array = np.round(np.linspace(1.9, 0.1, 10), 2),
                   k = 4, metric = 'AUROC'):
    '''
    Iteratively train a grouplasso model and update alpha to find the
    parameter yielding best performing sparsity. This function
    currently only works for binary experiments.

    Parameters
    ----------
    **adata** : *AnnData* | *list[AnnData]*
        > `AnnData`(s) with `'Z_train'` and `'Z_test'` in
        `adata.uns.keys()`.

    **group_size** : *None* | *int*
        > Argument describing how the features are grouped. If *None*,
        `2 * adata.uns['D']` will be used.
        For more information see
        [celer documentation](https://mathurinm.github.io/celer/generated/celer.GroupLasso.html).

    **tfidf** : *bool*
        > If `True`, TFIDF normalization will be run at each fold.

    **alpha_array** : *np.ndarray*
        > Array of all alpha values to be tested.

    **k** : *int*
        > Number of folds to perform cross validation over.

    **metric**: *str*
        > Which metric to use to optimize alpha. Options
        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and
        `'Recall'`

    Returns
    -------
    **alpha_star** : *float*
        > The best performing alpha value from cross validation on
        training data.

    Examples
    --------
    >>> alpha_star = scmkl.optimize_alpha(adata)
    >>> alpha_star
    0.1
    '''
    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'

    import warnings
    warnings.filterwarnings('ignore')

    # Multimodal case: delegate before touching `.uns` (a bare list has
    # no `.uns`) and derive the default group size from the first modality.
    # `k` is forwarded so the requested fold count is honored.
    if isinstance(adata, list):
        if group_size is None:
            group_size = adata[0].uns['D'] * 2
        return _multimodal_optimize_alpha(adatas = adata, group_size = group_size,
                                          tfidf = tfidf, alpha_array = alpha_array,
                                          k = k, metric = metric)

    if group_size is None:
        group_size = adata.uns['D'] * 2

    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()

    # Split the labels evenly between folds (round-robin per class so
    # class balance is preserved in every fold)
    positive_indices = np.where(y == np.unique(y)[0])[0]
    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)

    positive_annotations = np.arange(len(positive_indices)) % k
    negative_annotations = np.arange(len(negative_indices)) % k

    metric_array = np.zeros((len(alpha_array), k))

    gc.collect()

    for fold in np.arange(k):

        cv_adata = adata[adata.uns['train_indices'], :]

        # Create CV train/test indices
        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]],
                                     negative_indices[np.where(negative_annotations != fold)[0]]))
        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]],
                                    negative_indices[np.where(negative_annotations == fold)[0]]))

        cv_adata.uns['train_indices'] = fold_train
        cv_adata.uns['test_indices'] = fold_test

        if tfidf:
            cv_adata = tfidf_normalize(cv_adata, binarize = True)

        cv_adata = estimate_sigma(cv_adata, n_features = 200)
        cv_adata = calculate_z(cv_adata, n_features = 5000)

        gc.collect()

        for i, alpha in enumerate(alpha_array):
            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
            _, metrics = predict(cv_adata, metrics = [metric])
            metric_array[i, fold] = metrics[metric]
            gc.collect()

        del cv_adata
        gc.collect()

    # Mean metric across the k folds; choose the alpha with the highest mean
    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
    gc.collect()

    return alpha_star
def optimize_alpha(adata, group_size = None, tfidf = False,
                   alpha_array = np.round(np.linspace(1.9, 0.1, 10), 2),
                   k = 4, metric = 'AUROC'):
    '''
    Iteratively train a grouplasso model and update alpha to find the
    parameter yielding best performing sparsity. This function
    currently only works for binary experiments.

    Parameters
    ----------
    **adata** : *AnnData* | *list[AnnData]*
        > `AnnData`(s) with `'Z_train'` and `'Z_test'` in
        `adata.uns.keys()`.

    **group_size** : *None* | *int*
        > Argument describing how the features are grouped. If *None*,
        `2 * adata.uns['D']` will be used.
        For more information see
        [celer documentation](https://mathurinm.github.io/celer/generated/celer.GroupLasso.html).

    **tfidf** : *bool*
        > If `True`, TFIDF normalization will be run at each fold.

    **alpha_array** : *np.ndarray*
        > Array of all alpha values to be tested.

    **k** : *int*
        > Number of folds to perform cross validation over.

    **metric**: *str*
        > Which metric to use to optimize alpha. Options
        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and
        `'Recall'`

    Returns
    -------
    **alpha_star** : *float*
        > The best performing alpha value from cross validation on
        training data.

    Examples
    --------
    >>> alpha_star = scmkl.optimize_alpha(adata)
    >>> alpha_star
    0.1
    '''
    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'

    import warnings
    warnings.filterwarnings('ignore')

    # Multimodal case: delegate before touching `.uns` (a bare list has
    # no `.uns`) and derive the default group size from the first modality.
    # `k` is forwarded so the requested fold count is honored.
    if isinstance(adata, list):
        if group_size is None:
            group_size = adata[0].uns['D'] * 2
        return _multimodal_optimize_alpha(adatas = adata, group_size = group_size,
                                          tfidf = tfidf, alpha_array = alpha_array,
                                          k = k, metric = metric)

    if group_size is None:
        group_size = adata.uns['D'] * 2

    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()

    # Split the labels evenly between folds (round-robin per class so
    # class balance is preserved in every fold)
    positive_indices = np.where(y == np.unique(y)[0])[0]
    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)

    positive_annotations = np.arange(len(positive_indices)) % k
    negative_annotations = np.arange(len(negative_indices)) % k

    metric_array = np.zeros((len(alpha_array), k))

    gc.collect()

    for fold in np.arange(k):

        cv_adata = adata[adata.uns['train_indices'], :]

        # Create CV train/test indices
        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]],
                                     negative_indices[np.where(negative_annotations != fold)[0]]))
        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]],
                                    negative_indices[np.where(negative_annotations == fold)[0]]))

        cv_adata.uns['train_indices'] = fold_train
        cv_adata.uns['test_indices'] = fold_test

        if tfidf:
            cv_adata = tfidf_normalize(cv_adata, binarize = True)

        cv_adata = estimate_sigma(cv_adata, n_features = 200)
        cv_adata = calculate_z(cv_adata, n_features = 5000)

        gc.collect()

        for i, alpha in enumerate(alpha_array):
            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
            _, metrics = predict(cv_adata, metrics = [metric])
            metric_array[i, fold] = metrics[metric]
            gc.collect()

        del cv_adata
        gc.collect()

    # Mean metric across the k folds; choose the alpha with the highest mean
    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
    gc.collect()

    return alpha_star
Iteratively train a grouplasso model and update alpha to find the parameter yielding best performing sparsity. This function currently only works for binary experiments.
Parameters
adata : AnnData | list[AnnData]
AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().
group_size : None | int
Argument describing how the features are grouped. If None, `2 * adata.uns['D']` will be used. For more information see celer documentation.
tfidf : bool
If True, TFIDF normalization will be run at each fold.
alpha_array : np.ndarray
Array of all alpha values to be tested.
k : int
Number of folds to perform cross validation over.
metric: str
Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.
Returns
alpha_star : float
The best performing alpha value from cross validation on training data.
Examples
>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1