scmkl.optimize_alpha

View Source

  1import numpy as np
  2import gc
  3import tracemalloc
  4
  5from scmkl.tfidf_normalize import tfidf_normalize
  6from scmkl.estimate_sigma import estimate_sigma
  7from scmkl.calculate_z import calculate_z
  8from scmkl.train_model import train_model
  9from scmkl.multimodal_processing import multimodal_processing
 10from scmkl.test import predict
 11
 12
 13def _multimodal_optimize_alpha(adatas : list, group_size = 1, tfidf = [False, False],
 14                               alpha_array = np.round(np.linspace(1.9,0.1, 10),2), k = 4,
 15                               metric = 'AUROC'):
 16    '''
 17    Iteratively train a grouplasso model and update alpha to find the parameter yielding the desired sparsity.
 18    This function is meant to find a good starting point for your model, and the alpha may need further fine tuning.
 19    Input:
 20        adatas- a list of AnnData objects where each object is one modality and Z_train and Z_test are calculated
 21        group_size- Argument describing how the features are grouped. 
 22            From Celer documentation:
 23            "groupsint | list of ints | list of lists of ints.
 24                Partition of features used in the penalty on w. 
 25                    If an int is passed, groups are contiguous blocks of features, of size groups. 
 26                    If a list of ints is passed, groups are assumed to be contiguous, group number g being of size groups[g]. 
 27                    If a list of lists of ints is passed, groups[g] contains the feature indices of the group number g."
 28            If 1, model will behave identically to Lasso Regression.
 29        tifidf_list- a boolean mask where tfidf_list[0] and tfidf_list[1] are respective to adata1 and adata2
 30            If True, tfidf normalization will be applied to the respective adata during cross validation
 31        starting_alpha- The alpha value to start the search at.
 32        alpha_array- Numpy array of all alpha values to be tested
 33        k- number of folds to perform cross validation over
 34            
 35    Output:
 36        sparsity_dict- Dictionary with tested alpha as keys and the number of selected pathways as the values
 37        alpha- The alpha value yielding the number of selected groups closest to the target.
 38    '''
 39
 40    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'
 41
 42    import warnings 
 43    warnings.filterwarnings('ignore')
 44
 45    y = adatas[0].obs['labels'].iloc[adatas[0].uns['train_indices']].to_numpy()
 46    
 47    # Splits the labels evenly between folds
 48    positive_indices = np.where(y == np.unique(y)[0])[0]
 49    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)
 50
 51    positive_annotations = np.arange(len(positive_indices)) % k
 52    negative_annotations = np.arange(len(negative_indices)) % k
 53
 54    metric_array = np.zeros((len(alpha_array), k))
 55
 56    cv_adatas = []
 57
 58    for adata in adatas:
 59        cv_adatas.append(adata[adata.uns['train_indices'],:].copy())
 60
 61    del adatas
 62    gc.collect()
 63
 64    for fold in np.arange(k):
 65        
 66        print(f'Fold {fold + 1}:\n Memory Usage: {[mem / 1e9 for mem in tracemalloc.get_traced_memory()]} GB')
 67
 68        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], negative_indices[np.where(negative_annotations != fold)[0]]))
 69        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], negative_indices[np.where(negative_annotations == fold)[0]]))
 70
 71        for i in range(len(cv_adatas)):
 72            cv_adatas[i].uns['train_indices'] = fold_train
 73            cv_adatas[i].uns['test_indices'] = fold_test
 74
 75        # Creating dummy names for cv. 
 76        # #Necessary for interpretability but not for AUROC cv
 77        dummy_names = [f'adata {i}' for i in range(len(cv_adatas))]
 78
 79        # Calculate the Z's for each modality independently
 80        fold_cv_adata = multimodal_processing(adatas = cv_adatas, names = dummy_names, tfidf = tfidf)
 81        fold_cv_adata.uns['seed_obj'] = cv_adatas[0].uns['seed_obj']
 82
 83        gc.collect()
 84
 85        for j, alpha in enumerate(alpha_array):
 86
 87            fold_cv_adata = train_model(fold_cv_adata, group_size, alpha = alpha)
 88
 89            _, metrics = predict(fold_cv_adata, metrics = [metric])
 90            metric_array[j, fold] = metrics[metric]
 91
 92        del fold_cv_adata
 93        gc.collect()
 94
 95    # Take AUROC mean across the k folds and select the alpha resulting in highest AUROC
 96    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
 97    del cv_adatas
 98    gc.collect()
 99    
100    return alpha_star
101
102
103def optimize_alpha(adata, group_size = None, tfidf = False, 
104                   alpha_array = np.round(np.linspace(1.9,0.1, 10),2), 
105                   k = 4, metric = 'AUROC'):
106    '''
107    Iteratively train a grouplasso model and update alpha to find the 
108    parameter yielding best performing sparsity. This function 
109    currently only works for binary experiments.
110
111    Parameters
112    ----------
113    **adata** : *AnnData* | *list[AnnData]*
114        > `AnnData`(s) with `'Z_train'` and `'Z_test'` in 
115        `adata.uns.keys()`.
116
117    **group_size** : *None* | *int*
118        > Argument describing how the features are grouped. If *None*, 
119        `2 * adata.uns['D'] will be used. 
120        For more information see 
121        [celer documentation](https://mathurinm.github.io/celer/generated/celer.GroupLasso.html).
122
123    **tfidf** : *bool* 
124        > If `True`, TFIDF normalization will be run at each fold.
125    
126    **alpha_array** : *np.ndarray*
127        > Array of all alpha values to be tested.
128
129    **k** : *int*
130        > Number of folds to perform cross validation over.
131            
132    **metric**: *str*
133        > Which metric to use to optimize alpha. Options
134        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and 
135        `'Recall'`
136
137    Returns
138    -------
139    **alpha_star** : *int*
140        > The best performing alpha value from cross validation on 
141        training data.
142
143    Examples
144    --------
145    >>> alpha_star = scmkl.optimize_alpha(adata)
146    >>> alpha_star
147    0.1
148    '''
149
150    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'
151
152    import warnings 
153    warnings.filterwarnings('ignore')
154
155    if group_size == None:
156        group_size = adata.uns['D'] * 2
157
158    if type(adata) == list:
159        alpha_star = _multimodal_optimize_alpha(adatas = adata, group_size = group_size, tfidf = tfidf, alpha_array = alpha_array, metric = metric)
160        return alpha_star
161
162    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()
163    
164    # Splits the labels evenly between folds
165    positive_indices = np.where(y == np.unique(y)[0])[0]
166    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)
167
168    positive_annotations = np.arange(len(positive_indices)) % k
169    negative_annotations = np.arange(len(negative_indices)) % k
170
171    metric_array = np.zeros((len(alpha_array), k))
172
173    gc.collect()
174
175    for fold in np.arange(k):
176
177        cv_adata = adata[adata.uns['train_indices'],:]
178
179        # Create CV train/test indices
180        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 
181                                     negative_indices[np.where(negative_annotations != fold)[0]]))
182        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 
183                                    negative_indices[np.where(negative_annotations == fold)[0]]))
184
185        cv_adata.uns['train_indices'] = fold_train
186        cv_adata.uns['test_indices'] = fold_test
187
188        if tfidf:
189            cv_adata = tfidf_normalize(cv_adata, binarize= True)
190
191        cv_adata = estimate_sigma(cv_adata, n_features = 200)
192        cv_adata = calculate_z(cv_adata, n_features= 5000)
193
194        gc.collect()
195
196        for i, alpha in enumerate(alpha_array):
197
198            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
199            _, metrics = predict(cv_adata, metrics = [metric])
200            metric_array[i, fold] = metrics[metric]
201
202            gc.collect()
203
204        del cv_adata
205        gc.collect()
206        
207    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
208    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
209    gc.collect()
210    
211
212    return alpha_star

def optimize_alpha( adata, group_size=None, tfidf=False, alpha_array=array([1.9, 1.7, 1.5, 1.3, 1.1, 0.9, 0.7, 0.5, 0.3, 0.1]), k=4, metric='AUROC'): View Source

104def optimize_alpha(adata, group_size = None, tfidf = False, 
105                   alpha_array = np.round(np.linspace(1.9,0.1, 10),2), 
106                   k = 4, metric = 'AUROC'):
107    '''
108    Iteratively train a grouplasso model and update alpha to find the 
109    parameter yielding best performing sparsity. This function 
110    currently only works for binary experiments.
111
112    Parameters
113    ----------
114    **adata** : *AnnData* | *list[AnnData]*
115        > `AnnData`(s) with `'Z_train'` and `'Z_test'` in 
116        `adata.uns.keys()`.
117
118    **group_size** : *None* | *int*
119        > Argument describing how the features are grouped. If *None*, 
120        `2 * adata.uns['D'] will be used. 
121        For more information see 
122        [celer documentation](https://mathurinm.github.io/celer/generated/celer.GroupLasso.html).
123
124    **tfidf** : *bool* 
125        > If `True`, TFIDF normalization will be run at each fold.
126    
127    **alpha_array** : *np.ndarray*
128        > Array of all alpha values to be tested.
129
130    **k** : *int*
131        > Number of folds to perform cross validation over.
132            
133    **metric**: *str*
134        > Which metric to use to optimize alpha. Options
135        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and 
136        `'Recall'`
137
138    Returns
139    -------
140    **alpha_star** : *int*
141        > The best performing alpha value from cross validation on 
142        training data.
143
144    Examples
145    --------
146    >>> alpha_star = scmkl.optimize_alpha(adata)
147    >>> alpha_star
148    0.1
149    '''
150
151    assert isinstance(k, int) and k > 0, 'Must be a positive integer number of folds'
152
153    import warnings 
154    warnings.filterwarnings('ignore')
155
156    if group_size == None:
157        group_size = adata.uns['D'] * 2
158
159    if type(adata) == list:
160        alpha_star = _multimodal_optimize_alpha(adatas = adata, group_size = group_size, tfidf = tfidf, alpha_array = alpha_array, metric = metric)
161        return alpha_star
162
163    y = adata.obs['labels'].iloc[adata.uns['train_indices']].to_numpy()
164    
165    # Splits the labels evenly between folds
166    positive_indices = np.where(y == np.unique(y)[0])[0]
167    negative_indices = np.setdiff1d(np.arange(len(y)), positive_indices)
168
169    positive_annotations = np.arange(len(positive_indices)) % k
170    negative_annotations = np.arange(len(negative_indices)) % k
171
172    metric_array = np.zeros((len(alpha_array), k))
173
174    gc.collect()
175
176    for fold in np.arange(k):
177
178        cv_adata = adata[adata.uns['train_indices'],:]
179
180        # Create CV train/test indices
181        fold_train = np.concatenate((positive_indices[np.where(positive_annotations != fold)[0]], 
182                                     negative_indices[np.where(negative_annotations != fold)[0]]))
183        fold_test = np.concatenate((positive_indices[np.where(positive_annotations == fold)[0]], 
184                                    negative_indices[np.where(negative_annotations == fold)[0]]))
185
186        cv_adata.uns['train_indices'] = fold_train
187        cv_adata.uns['test_indices'] = fold_test
188
189        if tfidf:
190            cv_adata = tfidf_normalize(cv_adata, binarize= True)
191
192        cv_adata = estimate_sigma(cv_adata, n_features = 200)
193        cv_adata = calculate_z(cv_adata, n_features= 5000)
194
195        gc.collect()
196
197        for i, alpha in enumerate(alpha_array):
198
199            cv_adata = train_model(cv_adata, group_size, alpha = alpha)
200            _, metrics = predict(cv_adata, metrics = [metric])
201            metric_array[i, fold] = metrics[metric]
202
203            gc.collect()
204
205        del cv_adata
206        gc.collect()
207        
208    # Take AUROC mean across the k folds to find alpha yielding highest AUROC
209    alpha_star = alpha_array[np.argmax(np.mean(metric_array, axis = 1))]
210    gc.collect()
211    
212
213    return alpha_star

Iteratively train a grouplasso model and update alpha to find the parameter yielding best performing sparsity. This function currently only works for binary experiments.

Parameters

adata : AnnData | list[AnnData]

AnnData(s) with 'Z_train' and 'Z_test' in adata.uns.keys().

group_size : None | int

Argument describing how the features are grouped. If None, `2 * adata.uns['D'] will be used. For more information see celer documentation.

tfidf : bool

If True, TFIDF normalization will be run at each fold.

alpha_array : np.ndarray

Array of all alpha values to be tested.

k : int

Number of folds to perform cross validation over.

metric: str

Which metric to use to optimize alpha. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'

Returns

alpha_star : int

The best performing alpha value from cross validation on training data.

Examples

>>> alpha_star = scmkl.optimize_alpha(adata)
>>> alpha_star
0.1