scmkl.test

View Source

  1import numpy as np
  2import sklearn.metrics as skm
  3
  4
  5def predict(adata, metrics = None, return_probs = False):
  6    '''
  7    Function to return predicted labels and calculate any of AUROC, 
  8    Accuracy, F1 Score, Precision, Recall for a classification. 
  9
 10    ** If labeled_test flag is set to False, metrics cannot be 
 11    computed.**
 12    
 13    Parameters
 14    ----------  
 15    **adata** : *AnnData*
 16        > Has keys `'model'`, `'Z_train'`, and `'Z_test'` in 
 17        `adata.uns`.
 18
 19    **metrics** : *list[str]* | *None*
 20        > Which metrics to calculate on the predicted values. Options
 21        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and 
 22        `'Recall'`.
 23
 24    **return_probs** : *bool*
 25        > If `True`, will return a dictionary with class probabilities.
 26
 27    Returns
 28    -------
 29    **y_pred** : *np.ndarray*
 30        > Predicted cell classes.
 31
 32    **metrics_dict** : *dict*
 33        > Contains `'AUROC'`, `'Accuracy'`, `'F1-Score'`, 
 34        `'Precision'`, and/or `'Recall'` keys depending on metrics 
 35        argument.
 36
 37    **probs** : *dict*
 38        > If `return_probs` is `True`, will return a dictionary with 
 39        probabilities for each class in `y_test`.
 40
 41    Examples
 42    --------
 43    >>> adata = scmkl.estimate_sigma(adata)
 44    >>> adata = scmkl.calculate_z(adata)
 45    >>> metrics = ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 
 46    ...            'Recall']
 47    >>> adata = scmkl.train_model(adata, metrics = metrics)
 48    >>>
 49    >>> metrics_dict = scmkl.predict(adata)
 50    >>> metrics_dict.keys()
 51    dict_keys(['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])
 52    '''
 53    X_test = adata.uns['Z_test']
 54
 55    allowed_mets = ['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall']
 56
 57    # Asserting all input metrics are valid
 58    if metrics is not None:
 59        mets_allowed = [metric in allowed_mets for metric in metrics]
 60        assert all(mets_allowed), ("Unknown metric provided. Must be None, "
 61                                   f"or one or more of {allowed_mets}")
 62
 63    # Capturing class labels
 64    train_idx = adata.uns['train_indices']
 65    classes = np.unique(adata.obs['labels'].iloc[train_idx].to_numpy())
 66
 67    # Sigmoid function to force probabilities into [0,1]
 68    probabilities = 1 / (1 + np.exp(-adata.uns['model'].predict(X_test)))
 69
 70    #Convert numerical probabilities into binary phenotype
 71    y_pred = np.array(np.repeat(classes[1], X_test.shape[0]), 
 72                      dtype = 'object')
 73    y_pred[np.round(probabilities, 0).astype(int) == 1] = classes[0]
 74
 75    if not adata.uns['labeled_test']:
 76        if not metrics is None:
 77            print("WARNING: Cannot calculate classification metrics "
 78                  "for unlabeled test data")
 79            metrics = None
 80    else:
 81        y_test = adata.obs['labels'].iloc[adata.uns['test_indices']]
 82        y_test = y_test.to_numpy()
 83        X_test = adata.uns['Z_test']
 84        assert X_test.shape[0] == len(y_test), ("X rows and length of y must "
 85                                                "be equal")
 86
 87        # Group Lasso requires 'continous' y values need to re-descritize it
 88        y = np.zeros((len(y_test)))
 89        y[y_test == classes[0]] = 1
 90
 91        metric_dict = {}
 92
 93        if (metrics is None) and (return_probs == False):
 94            return y_pred
 95        
 96        # Calculate and save metrics given in metrics
 97        p_cl = classes[0]
 98        if 'AUROC' in metrics:
 99            fpr, tpr, _ = skm.roc_curve(y, probabilities)
100            metric_dict['AUROC'] = skm.auc(fpr, tpr)
101        if 'Accuracy' in metrics:
102            metric_dict['Accuracy'] = np.mean(y_test == y_pred)
103        if 'F1-Score' in metrics:
104            metric_dict['F1-Score'] = skm.f1_score(y_test, y_pred, 
105                                                   pos_label = p_cl)
106        if 'Precision' in metrics:
107            metric_dict['Precision'] = skm.precision_score(y_test, y_pred, 
108                                                           pos_label = p_cl)
109        if 'Recall' in metrics:
110            metric_dict['Recall'] = skm.recall_score(y_test, y_pred, 
111                                                     pos_label = p_cl)
112
113    if return_probs:
114        probs = {classes[0] : probabilities,
115                 classes[1] : 1 - probabilities}
116        if metrics is not None:
117            return y_pred, metric_dict, probs
118        else:
119            return y_pred, probs
120    else:
121        if metrics is not None:
122            return y_pred, metric_dict
123        else:
124            return y_pred
125
126
def find_selected_groups(adata) -> np.ndarray:
    '''
    Find feature groups selected by the model during training. If 
    feature weight assigned by the model is non-0, then the group 
    containing that feature is selected.

    Parameters
    ----------
    **adata** : *AnnData*
        > Has *celer.GroupLasso* object in `adata.uns['model']` and 
        the grouping dictionary in `adata.uns['group_dict']`. The 
        order of the keys in `group_dict` is taken as the order of 
        the groups in the model coefficients.

    Returns
    -------
    **selected_groups** : *np.ndarray*
        > Array containing the names of the groups with nonzero kernel 
        weights.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata = scmkl.calculate_z(adata)
    >>> adata = scmkl.train_model(adata)
    >>>
    >>> selected_groups = scmkl.find_selected_groups(adata)
    >>> selected_groups
    np.ndarray(['HALLMARK_ESTROGEN_RESPONSE_EARLY', 
                'HALLMARK_HYPOXIA'])
    '''

    model = adata.uns['model']
    coefficients = model.coef_
    group_size = model.get_params()['groups']
    group_names = np.array(list(adata.uns['group_dict'].keys()))

    # A sequence is indexed per group to get that group's coefficient 
    # positions; anything else is treated as one shared block size
    explicit_groups = isinstance(group_size, 
                                 (list, set, np.ndarray, tuple))

    selected_groups = []

    # Loop over the model weights associated with each group and calculate 
    # the L2 norm
    for i, group in enumerate(group_names):
        if explicit_groups:
            group_cols = group_size[i]
        else:
            # np.arange excludes its stop value, so the stop is the first 
            # column of the next group; subtracting 1 here would drop the 
            # last coefficient of every group
            group_cols = np.arange(i * group_size, (i + 1) * group_size)

        group_norm = np.linalg.norm(coefficients[group_cols])

        # Only include the group if the norm of its weights is nonzero
        if group_norm != 0:
            selected_groups.append(group)

    return np.array(selected_groups)

def predict(adata, metrics=None, return_probs=False): View Source

  6def predict(adata, metrics = None, return_probs = False):
  7    '''
  8    Function to return predicted labels and calculate any of AUROC, 
  9    Accuracy, F1 Score, Precision, Recall for a classification. 
 10
 11    ** If labeled_test flag is set to False, metrics cannot be 
 12    computed.**
 13    
 14    Parameters
 15    ----------  
 16    **adata** : *AnnData*
 17        > Has keys `'model'`, `'Z_train'`, and `'Z_test'` in 
 18        `adata.uns`.
 19
 20    **metrics** : *list[str]* | *None*
 21        > Which metrics to calculate on the predicted values. Options
 22        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and 
 23        `'Recall'`.
 24
 25    **return_probs** : *bool*
 26        > If `True`, will return a dictionary with class probabilities.
 27
 28    Returns
 29    -------
 30    **y_pred** : *np.ndarray*
 31        > Predicted cell classes.
 32
 33    **metrics_dict** : *dict*
 34        > Contains `'AUROC'`, `'Accuracy'`, `'F1-Score'`, 
 35        `'Precision'`, and/or `'Recall'` keys depending on metrics 
 36        argument.
 37
 38    **probs** : *dict*
 39        > If `return_probs` is `True`, will return a dictionary with 
 40        probabilities for each class in `y_test`.
 41
 42    Examples
 43    --------
 44    >>> adata = scmkl.estimate_sigma(adata)
 45    >>> adata = scmkl.calculate_z(adata)
 46    >>> metrics = ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 
 47    ...            'Recall']
 48    >>> adata = scmkl.train_model(adata, metrics = metrics)
 49    >>>
 50    >>> metrics_dict = scmkl.predict(adata)
 51    >>> metrics_dict.keys()
 52    dict_keys(['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])
 53    '''
 54    X_test = adata.uns['Z_test']
 55
 56    allowed_mets = ['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall']
 57
 58    # Asserting all input metrics are valid
 59    if metrics is not None:
 60        mets_allowed = [metric in allowed_mets for metric in metrics]
 61        assert all(mets_allowed), ("Unknown metric provided. Must be None, "
 62                                   f"or one or more of {allowed_mets}")
 63
 64    # Capturing class labels
 65    train_idx = adata.uns['train_indices']
 66    classes = np.unique(adata.obs['labels'].iloc[train_idx].to_numpy())
 67
 68    # Sigmoid function to force probabilities into [0,1]
 69    probabilities = 1 / (1 + np.exp(-adata.uns['model'].predict(X_test)))
 70
 71    #Convert numerical probabilities into binary phenotype
 72    y_pred = np.array(np.repeat(classes[1], X_test.shape[0]), 
 73                      dtype = 'object')
 74    y_pred[np.round(probabilities, 0).astype(int) == 1] = classes[0]
 75
 76    if not adata.uns['labeled_test']:
 77        if not metrics is None:
 78            print("WARNING: Cannot calculate classification metrics "
 79                  "for unlabeled test data")
 80            metrics = None
 81    else:
 82        y_test = adata.obs['labels'].iloc[adata.uns['test_indices']]
 83        y_test = y_test.to_numpy()
 84        X_test = adata.uns['Z_test']
 85        assert X_test.shape[0] == len(y_test), ("X rows and length of y must "
 86                                                "be equal")
 87
 88        # Group Lasso requires 'continous' y values need to re-descritize it
 89        y = np.zeros((len(y_test)))
 90        y[y_test == classes[0]] = 1
 91
 92        metric_dict = {}
 93
 94        if (metrics is None) and (return_probs == False):
 95            return y_pred
 96        
 97        # Calculate and save metrics given in metrics
 98        p_cl = classes[0]
 99        if 'AUROC' in metrics:
100            fpr, tpr, _ = skm.roc_curve(y, probabilities)
101            metric_dict['AUROC'] = skm.auc(fpr, tpr)
102        if 'Accuracy' in metrics:
103            metric_dict['Accuracy'] = np.mean(y_test == y_pred)
104        if 'F1-Score' in metrics:
105            metric_dict['F1-Score'] = skm.f1_score(y_test, y_pred, 
106                                                   pos_label = p_cl)
107        if 'Precision' in metrics:
108            metric_dict['Precision'] = skm.precision_score(y_test, y_pred, 
109                                                           pos_label = p_cl)
110        if 'Recall' in metrics:
111            metric_dict['Recall'] = skm.recall_score(y_test, y_pred, 
112                                                     pos_label = p_cl)
113
114    if return_probs:
115        probs = {classes[0] : probabilities,
116                 classes[1] : 1 - probabilities}
117        if metrics is not None:
118            return y_pred, metric_dict, probs
119        else:
120            return y_pred, probs
121    else:
122        if metrics is not None:
123            return y_pred, metric_dict
124        else:
125            return y_pred

Function to return predicted labels and calculate any of AUROC, Accuracy, F1 Score, Precision, Recall for a classification.

Note: if the labeled_test flag is set to False, metrics cannot be computed.

Parameters

adata : AnnData

Has keys 'model', 'Z_train', and 'Z_test' in adata.uns.

metrics : list[str] | None

Which metrics to calculate on the predicted values. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'.

return_probs : bool

If True, will return a dictionary with class probabilities.

Returns

y_pred : np.ndarray

Predicted cell classes.

metrics_dict : dict

Contains 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and/or 'Recall' keys depending on metrics argument.

probs : dict

If return_probs is True, will return a dictionary with probabilities for each class in y_test.

Examples

>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> metrics = ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 
...            'Recall']
>>> adata = scmkl.train_model(adata, metrics = metrics)
>>>
>>> y_pred, metrics_dict = scmkl.predict(adata, metrics = metrics)
>>> metrics_dict.keys()
dict_keys(['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])

def find_selected_groups(adata) -> numpy.ndarray: View Source

def find_selected_groups(adata) -> np.ndarray:
    '''
    Find feature groups selected by the model during training. If 
    feature weight assigned by the model is non-0, then the group 
    containing that feature is selected.

    Parameters
    ----------
    **adata** : *AnnData*
        > Has *celer.GroupLasso* object in `adata.uns['model']` and 
        the grouping dictionary in `adata.uns['group_dict']`. The 
        order of the keys in `group_dict` is taken as the order of 
        the groups in the model coefficients.

    Returns
    -------
    **selected_groups** : *np.ndarray*
        > Array containing the names of the groups with nonzero kernel 
        weights.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata = scmkl.calculate_z(adata)
    >>> adata = scmkl.train_model(adata)
    >>>
    >>> selected_groups = scmkl.find_selected_groups(adata)
    >>> selected_groups
    np.ndarray(['HALLMARK_ESTROGEN_RESPONSE_EARLY', 
                'HALLMARK_HYPOXIA'])
    '''

    model = adata.uns['model']
    coefficients = model.coef_
    group_size = model.get_params()['groups']
    group_names = np.array(list(adata.uns['group_dict'].keys()))

    # A sequence is indexed per group to get that group's coefficient 
    # positions; anything else is treated as one shared block size
    explicit_groups = isinstance(group_size, 
                                 (list, set, np.ndarray, tuple))

    selected_groups = []

    # Loop over the model weights associated with each group and calculate 
    # the L2 norm
    for i, group in enumerate(group_names):
        if explicit_groups:
            group_cols = group_size[i]
        else:
            # np.arange excludes its stop value, so the stop is the first 
            # column of the next group; subtracting 1 here would drop the 
            # last coefficient of every group
            group_cols = np.arange(i * group_size, (i + 1) * group_size)

        group_norm = np.linalg.norm(coefficients[group_cols])

        # Only include the group if the norm of its weights is nonzero
        if group_norm != 0:
            selected_groups.append(group)

    return np.array(selected_groups)

Find feature groups selected by the model during training. If feature weight assigned by the model is non-0, then the group containing that feature is selected.

Parameters

adata : AnnData

Has celer.GroupLasso object in adata.uns['model'].

Returns

selected_groups : np.ndarray

Array containing the names of the groups with nonzero kernel weights.

Examples

>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> adata = scmkl.train_model(adata)
>>>
>>> selected_groups = scmkl.find_selected_groups(adata)
>>> selected_groups
np.ndarray(['HALLMARK_ESTROGEN_RESPONSE_EARLY', 
            'HALLMARK_HYPOXIA'])