scmkl.optimize_alpha

import numpy as np
import gc
import tracemalloc
import sklearn.metrics

from scmkl.tfidf_normalize import tfidf_normalize
from scmkl.estimate_sigma import estimate_sigma
from scmkl.calculate_z import calculate_z
from scmkl.train_model import train_model
from scmkl.multimodal_processing import multimodal_processing


def _calculate_auroc(adata) -> float:
    '''
    Calculates the AUROC for a classification. Designed as a helper 
    function; Predict() is recommended for model evaluation.

    Parameters
    ----------
    adata : AnnData object with a trained model and Z matrices in 
            adata.uns

    Returns
    -------
    The calculated AUROC value
    '''
    y_test = adata.obs['labels'].iloc[adata.uns['test_indices']].to_numpy()
    y_test = y_test.ravel()
    X_test = adata.uns['Z_test']

    assert X_test.shape[0] == len(y_test), (f"X has {X_test.shape[0]} "
                                            "samples and y has "
                                            f"{len(y_test)} samples.")

    # Sigmoid function to force probabilities into [0,1]
    probabilities = 1 / (1 + np.exp(-adata.uns['model'].predict(X_test)))

    # Group Lasso requires continuous y values, so re-discretize them here
    y = np.zeros(len(y_test))
    y[y_test == np.unique(y_test)[0]] = 1
    fpr, tpr, _ = sklearn.metrics.roc_curve(y, probabilities)
    auc = sklearn.metrics.auc(fpr, tpr)

    return auc
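
# A hedged, standalone illustration of the computation above, using synthetic
# decision values in place of adata.uns['model'].predict(X_test):
#
#     decision = np.array([-2.1, 0.3, 1.8, -0.4])
#     probs = 1 / (1 + np.exp(-decision))                # sigmoid to [0, 1]
#     y_bin = np.array([0, 1, 1, 0])                     # binarized labels
#     fpr, tpr, _ = sklearn.metrics.roc_curve(y_bin, probs)
#     sklearn.metrics.auc(fpr, tpr)                      # 1.0 for this toy case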


def _multimodal_optimize_alpha(adatas : list, group_size = 1, 
                               tfidf = [False, False], 
                               alpha_array = np.linspace(1.9, 0.1, 10), 
                               k = 4):
    '''
    Iteratively train a group lasso model across an array of alpha 
    values and select the alpha yielding the best cross-validated 
    AUROC. This function is meant to find a good starting point for 
    your model, and the alpha may need further fine tuning.

    Parameters
    ----------
    adatas : a list of AnnData objects where each object is one 
             modality and Z_train and Z_test are calculated

    group_size : Argument describing how the features are grouped. 
        From the celer documentation:
        "groups : int | list of ints | list of lists of ints.
            Partition of features used in the penalty on w. 
                If an int is passed, groups are contiguous blocks of 
                features, of size groups. 
                If a list of ints is passed, groups are assumed to be 
                contiguous, group number g being of size groups[g]. 
                If a list of lists of ints is passed, groups[g] 
                contains the feature indices of the group number g."
        If 1, the model will behave identically to Lasso regression.

    tfidf : a boolean mask where tfidf[i] corresponds to adatas[i]. 
            If True, TFIDF normalization will be applied to the 
            respective adata during cross validation.

    alpha_array : Numpy array of all alpha values to be tested

    k : number of folds to perform cross validation over

    Returns
    -------
    alpha_star : the alpha value yielding the highest mean AUROC 
                 across the k folds
    '''

    assert isinstance(k, int) and k > 0, ("k must be a positive integer "
                                          "number of folds")

    # Round the default alpha grid to two decimals; user-supplied grids are
    # left untouched
    if np.array_equal(alpha_array, np.linspace(1.9, 0.1, 10)):
        alpha_array = np.round(alpha_array, 2)

    import warnings
    warnings.filterwarnings('ignore')

    y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']]
    y = y.to_numpy()

    # Splits the labels evenly between folds
    pos_indices = np.where(y == np.unique(y)[0])[0]
    neg_indices = np.setdiff1d(np.arange(len(y)), pos_indices)

    pos_annotations = np.arange(len(pos_indices)) % k
    neg_annotations = np.arange(len(neg_indices)) % k

    auc_array = np.zeros((len(alpha_array), k))

    cv_adatas = []

    for adata in adatas:
        cv_adatas.append(adata[adata.uns['train_indices'], :].copy())

    del adatas
    gc.collect()

    for fold in np.arange(k):

        # Memory usage is only meaningful if the caller has started
        # tracemalloc; otherwise zeros are reported
        memory = [mem / 1e9 for mem in tracemalloc.get_traced_memory()]
        print(f'Fold {fold + 1}:\n Memory Usage: {memory} GB')

        # Capturing train indices for both classes
        ftrain_pos = pos_indices[np.where(pos_annotations != fold)[0]]
        ftrain_neg = neg_indices[np.where(neg_annotations != fold)[0]]

        # Capturing test indices for both classes
        ftest_pos = pos_indices[np.where(pos_annotations == fold)[0]]
        ftest_neg = neg_indices[np.where(neg_annotations == fold)[0]]

        # Creating class-balanced train/test split for fold
        fold_train = np.concatenate((ftrain_pos, ftrain_neg))
        fold_test = np.concatenate((ftest_pos, ftest_neg))

        for i in range(len(cv_adatas)):
            cv_adatas[i].uns['train_indices'] = fold_train
            cv_adatas[i].uns['test_indices'] = fold_test

        # Dummy modality names; needed by multimodal_processing for
        # interpretability but not for the AUROC cross validation itself
        dummy_names = [f'adata {i}' for i in range(len(cv_adatas))]

        # Calculate the Z's for each modality independently
        fold_cv_adata = multimodal_processing(adatas = cv_adatas, 
                                              names = dummy_names, 
                                              tfidf = tfidf)

        fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj']

        gc.collect()

        for j, alpha in enumerate(alpha_array):

            fold_cv_adata = train_model(fold_cv_adata, group_size, 
                                        alpha = alpha)

            auc_array[j, fold] = _calculate_auroc(fold_cv_adata)

        del fold_cv_adata
        gc.collect()

    # Take the AUROC mean across the k folds and select the alpha yielding
    # the highest mean AUROC
    alpha_star = alpha_array[np.argmax(np.mean(auc_array, axis = 1))]
    del cv_adatas
    gc.collect()

    return alpha_star


def optimize_alpha(adata, group_size = None, tfidf = False, 
                   alpha_array = np.round(np.linspace(1.9, 0.1, 10), 2), k = 4):
    '''
    Iteratively train a group lasso model and update alpha to find the 
    parameter yielding the best performing sparsity. This function 
    currently only works for binary experiments.

    Parameters
    ----------
    **adata** : *AnnData* | *list[AnnData]*
        > `AnnData`(s) with `'Z_train'` and `'Z_test'` in 
        `adata.uns.keys()`.

    **group_size** : *None* | *int*
        > Argument describing how the features are grouped. If *None*, 
        `2 * adata.uns['D']` will be used. 
        For more information see the celer documentation.

    **tfidf** : *bool* | *list[bool]*
        > If `True`, TFIDF normalization will be run at each fold. For 
        a list of `AnnData` objects, pass one boolean per modality.

    **alpha_array** : *np.ndarray*
        > Array of all alpha values to be tested.

    **k** : *int*
        > Number of folds to perform cross validation over.

    Returns
    -------
    **alpha_star** : *float*
        > The best performing alpha value from cross validation on 
        training data.

    Examples
    --------
    >>> alpha_star = scmkl.optimize_alpha(adata)
    >>> alpha_star
    0.1

    See Also
    --------
    celer.GroupLasso : 
    https://mathurinm.github.io/celer/generated/celer.GroupLasso.html
    '''

    assert isinstance(k, int) and k > 0, ("k must be a positive integer "
                                          "number of folds")

    import warnings
    warnings.filterwarnings('ignore')

    # Default the group size to 2 * D; for multimodal input, take D from the
    # first modality
    if group_size is None:
        if isinstance(adata, list):
            group_size = adata[0].uns['D'] * 2
        else:
            group_size = adata.uns['D'] * 2

    if isinstance(adata, list):
        alpha_star = _multimodal_optimize_alpha(adatas = adata, 
                                                group_size = group_size, 
                                                tfidf = tfidf, 
                                                alpha_array = alpha_array, 
                                                k = k)
        return alpha_star
    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()

    # Splits the labels evenly between folds
    pos_indices = np.where(y == np.unique(y)[0])[0]
    neg_indices = np.setdiff1d(np.arange(len(y)), pos_indices)

    pos_annotations = np.arange(len(pos_indices)) % k
    neg_annotations = np.arange(len(neg_indices)) % k

    auc_array = np.zeros((len(alpha_array), k))

    gc.collect()

    for fold in np.arange(k):

        cv_adata = adata[adata.uns['train_indices'], :]

        # Create CV train/test indices
        # Capturing train indices for both classes
        ftrain_pos = pos_indices[np.where(pos_annotations != fold)[0]]
        ftrain_neg = neg_indices[np.where(neg_annotations != fold)[0]]

        # Capturing test indices for both classes
        ftest_pos = pos_indices[np.where(pos_annotations == fold)[0]]
        ftest_neg = neg_indices[np.where(neg_annotations == fold)[0]]

        # Creating class-balanced train/test split for fold
        fold_train = np.concatenate((ftrain_pos, ftrain_neg))
        fold_test = np.concatenate((ftest_pos, ftest_neg))

        cv_adata.uns['train_indices'] = fold_train
        cv_adata.uns['test_indices'] = fold_test

        if tfidf:
            cv_adata = tfidf_normalize(cv_adata, binarize = True)

        cv_adata = estimate_sigma(cv_adata, n_features = 200)
        cv_adata = calculate_z(cv_adata, n_features = 5000)

        gc.collect()

        for i, alpha in enumerate(alpha_array):

            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
            auc_array[i, fold] = _calculate_auroc(cv_adata)

            gc.collect()

        del cv_adata
        gc.collect()

    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
    alpha_star = alpha_array[np.argmax(np.mean(auc_array, axis = 1))]
    gc.collect()

    return alpha_star
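
Both functions above build class-balanced folds the same way: samples from each class are assigned to the k folds round-robin with a modulo, and each fold's test split is the samples whose fold label matches. A minimal, self-contained sketch with synthetic labels (all values are illustrative):

import numpy as np

y = np.array(['A', 'B', 'A', 'B', 'A', 'A', 'B', 'B', 'A', 'B'])
k = 4

pos_indices = np.where(y == np.unique(y)[0])[0]
neg_indices = np.setdiff1d(np.arange(len(y)), pos_indices)

# Round-robin fold labels for each class, as in the functions above
pos_annotations = np.arange(len(pos_indices)) % k
neg_annotations = np.arange(len(neg_indices)) % k

fold = 0
fold_train = np.concatenate((pos_indices[np.where(pos_annotations != fold)[0]],
                             neg_indices[np.where(neg_annotations != fold)[0]]))
fold_test = np.concatenate((pos_indices[np.where(pos_annotations == fold)[0]],
                            neg_indices[np.where(neg_annotations == fold)[0]]))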
def optimize_alpha(adata, group_size=None, tfidf=False, alpha_array=array([1.9, 1.7, 1.5, 1.3, 1.1, 0.9, 0.7, 0.5, 0.3, 0.1]), k=4):

Iteratively train a group lasso model and update alpha to find the parameter yielding the best performing sparsity. This function currently only works for binary experiments.

Parameters

adata : AnnData | list[AnnData]

AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().

group_size : None | int

Argument describing how the features are grouped. If None, 2 * adata.uns['D'] will be used. For more information, see the celer documentation and the sketch after this parameter list.

tfidf : bool | list[bool]

If True, TFIDF normalization will be run at each fold. For a list of AnnData objects, pass one boolean per modality.

alpha_array : np.ndarray

Array of all alpha values to be tested.

k : int

Number of folds to perform cross validation over.
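
As referenced under group_size, the grouping can also be spelled out in the explicit forms celer's GroupLasso accepts. A minimal sketch with illustrative numbers (D stands in for adata.uns['D']; n_groups is hypothetical):

D = 100                                   # kernel features per group, illustrative
n_groups = 50                             # hypothetical number of groups

group_size = 2 * D                        # the default used by optimize_alpha

# Equivalent explicit forms, per the celer documentation quoted in the source:
sizes = [2 * D] * n_groups                # list of ints: contiguous groups
blocks = [list(range(g * 2 * D, (g + 1) * 2 * D))
          for g in range(n_groups)]       # list of lists: explicit feature indices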

Returns

alpha_star : float

The best performing alpha value from cross validation on training data.
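
A typical follow-up, assuming 'Z_train' and 'Z_test' are already in adata.uns as required above, is to train the final model with the selected alpha (sketch only; the positional group size mirrors the default described above):

>>> from scmkl.train_model import train_model
>>> adata = train_model(adata, 2 * adata.uns['D'], alpha = alpha_star)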

Examples

>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1
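
A custom alpha grid and fold count can also be supplied (the grid below is illustrative):

>>> import numpy as np
>>> alpha_grid = np.round(np.linspace(3.0, 0.05, 15), 2)
>>> alpha_star = scmkl.optimize_alpha(adata, alpha_array = alpha_grid, k = 5)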

See Also

celer.GroupLasso : https://mathurinm.github.io/celer/generated/celer.GroupLasso.html
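
For multimodal runs, list input is dispatched to the internal _multimodal_optimize_alpha helper, which expects one AnnData per modality and a per-modality boolean tfidf mask. A hedged sketch with hypothetical object names:

>>> alpha_star = scmkl.optimize_alpha([rna_adata, atac_adata],
...                                   tfidf = [False, True], k = 4)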