scmkl.test

  1import numpy as np
  2import sklearn.metrics as skm
  3import anndata as ad
  4
  5
  6def predict(adata: ad.AnnData, metrics: list | None=None,
  7            return_probs: bool=False):
  8    """
  9    Function to return predicted labels and calculate any of AUROC, 
 10    Accuracy, F1 Score, Precision, Recall for a classification. 
 11
 12    **If labeled_test flag in `adata` is set to `False`,
 13    metrics cannot be computed.**
 14    
 15    Parameters
 16    ----------
 17    adata : ad.AnnData
 18        Has keys `'model'`, `'Z_train'`, and `'Z_test'` in `adata.uns`.
 19
 20    metrics : list[str] | None
 21        Which metrics to calculate on the predicted values. Options
 22        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and 
 23        `'Recall'`. If `None`, all five metrics are calculated.
 24
 25    return_probs : bool
 26        If `True`, will return a dictionary with class probabilities.
 27
 28    Returns
 29    -------
 30    y_pred : np.ndarray
 31        Predicted cell classes.
 32
 33    metrics_dict : dict
 34        Contains `'AUROC'`, `'Accuracy'`, `'F1-Score'`, 
 35        `'Precision'`, and/or `'Recall'` keys depending on metrics 
 36        argument.
 37
 38    probs : dict
 39        If `return_probs` is `True`, will return a dictionary with 
 40        probabilities for each class in `y_test`.
 41
 42    Examples
 43    --------
 44    >>> adata = scmkl.estimate_sigma(adata)
 45    >>> adata = scmkl.calculate_z(adata)
 46    >>> metrics = ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 
 47    ...            'Recall']
 48    >>> adata = scmkl.train_model(adata, metrics = metrics)
 49    >>>
 50    >>> metrics_dict = scmkl.predict(adata)
 51    >>> metrics_dict.keys()
 52    dict_keys(['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])
 53    """
 54    X_test = adata.uns['Z_test']
 55
 56    allowed_mets = ['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall']
 57
 58    # Asserting all input metrics are valid
 59    if metrics is not None:
 60        mets_allowed = [metric in allowed_mets for metric in metrics]
 61        assert all(mets_allowed), ("Unknown metric provided. Must be None, "
 62                                   f"or one or more of {allowed_mets}")
 63
 64    # Capturing class labels
 65    train_idx = adata.uns['train_indices']
 66    classes = np.unique(adata.obs['labels'].iloc[train_idx].to_numpy())
 67
 68    # Sigmoid function to force probabilities into [0,1]
 69    probabilities = 1/(1 + np.exp(-adata.uns['model'].predict(X_test)))
 70
 71    #Convert numerical probabilities into binary phenotype
 72    y_pred = np.array(np.repeat(classes[1], X_test.shape[0]), 
 73                      dtype = 'object')
 74    y_pred[np.round(probabilities, 0).astype(int) == 1] = classes[0]
 75
 76    if not adata.uns['labeled_test']:
 77        if not metrics is None:
 78            print("WARNING: Cannot calculate classification metrics "
 79                  "for unlabeled test data")
 80            metrics = None
 81    else:
 82        y_test = adata.obs['labels'].iloc[adata.uns['test_indices']]
 83        y_test = y_test.to_numpy()
 84        X_test = adata.uns['Z_test']
 85        assert X_test.shape[0] == len(y_test), ("X rows and length of y must "
 86                                                "be equal")
 87
 88        # Group Lasso requires 'continous' y values need to re-descritize it
 89        y = np.zeros((len(y_test)))
 90        y[y_test == classes[0]] = 1
 91
 92        metric_dict = {}
 93
 94        if (metrics is None) and (return_probs == False):
 95            return y_pred
 96        
 97        # Calculate and save metrics given in metrics
 98        p_cl = classes[0]
 99        if 'AUROC' in metrics:
100            fpr, tpr, _ = skm.roc_curve(y, probabilities)
101            metric_dict['AUROC'] = skm.auc(fpr, tpr)
102        if 'Accuracy' in metrics:
103            metric_dict['Accuracy'] = np.mean(y_test == y_pred)
104        if 'F1-Score' in metrics:
105            metric_dict['F1-Score'] = skm.f1_score(y_test, y_pred, 
106                                                   pos_label = p_cl)
107        if 'Precision' in metrics:
108            metric_dict['Precision'] = skm.precision_score(y_test, y_pred, 
109                                                           pos_label = p_cl)
110        if 'Recall' in metrics:
111            metric_dict['Recall'] = skm.recall_score(y_test, y_pred, 
112                                                     pos_label = p_cl)
113
114    if return_probs:
115        probs = {classes[0] : probabilities,
116                 classes[1] : 1 - probabilities}
117        if metrics is not None:
118            return y_pred, metric_dict, probs
119        else:
120            return y_pred, probs
121    else:
122        if metrics is not None:
123            return y_pred, metric_dict
124        else:
125            return y_pred
126
127
def find_selected_groups(adata: ad.AnnData) -> np.ndarray:
    """
    Find feature groups selected by the model during training. A group
    is selected when the L2 norm of the model coefficients belonging to
    it is non-zero.

    Parameters
    ----------
    adata : ad.AnnData
        Has `celer.GroupLasso` object in `adata.uns['model']` and the
        group names as the keys of `adata.uns['group_dict']`. The i-th
        key is matched with the i-th group of the model.

    Returns
    -------
    selected_groups : np.ndarray
        Array containing the names of the groups with nonzero kernel
        weights.

    Notes
    -----
    The model's `groups` parameter is interpreted as follows:

    - a scalar: every group is a contiguous block of that many
      coefficients;
    - a sequence of scalars: contiguous blocks whose sizes are the
      entries of the sequence;
    - a sequence of sequences: each entry lists the coefficient
      indices of one group.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata = scmkl.calculate_z(adata)
    >>> adata = scmkl.train_model(adata)
    >>>
    >>> selected_groups = scmkl.find_selected_groups(adata)
    >>> selected_groups
    array(['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLMARK_HYPOXIA'],
          dtype='<U32')
    """
    model = adata.uns['model']
    coefficients = model.coef_
    groups = model.get_params()['groups']
    group_names = np.array(list(adata.uns['group_dict'].keys()))

    scalar_groups = not isinstance(groups, (list, set, np.ndarray, tuple))

    selected_groups = []
    # Running start column, used only when `groups` lists per-group sizes
    contiguous_start = 0

    # Loop over the model weights associated with each group and calculate
    # the L2 norm
    for i, group in enumerate(group_names):
        if scalar_groups:
            # np.arange excludes its stop value, so the stop must be
            # start + size to cover every coefficient of the group
            # (the former `- 1` dropped the last one).
            group_start = i * groups
            group_cols = np.arange(group_start, group_start + groups)
        elif np.ndim(groups[i]) == 0:
            # Flat sequence of sizes: groups are contiguous blocks
            size = int(groups[i])
            group_cols = np.arange(contiguous_start, contiguous_start + size)
            contiguous_start += size
        else:
            # Explicit coefficient indices for this group
            group_cols = groups[i]

        group_norm = np.linalg.norm(coefficients[group_cols])

        # Only include the group if its weights are not all zero
        if group_norm != 0:
            selected_groups.append(group)

    return np.array(selected_groups)
def predict( adata: anndata._core.anndata.AnnData, metrics: list | None = None, return_probs: bool = False):
  7def predict(adata: ad.AnnData, metrics: list | None=None,
  8            return_probs: bool=False):
  9    """
 10    Function to return predicted labels and calculate any of AUROC, 
 11    Accuracy, F1 Score, Precision, Recall for a classification. 
 12
 13    **If labeled_test flag in `adata` is set to `False`,
 14    metrics cannot be computed.**
 15    
 16    Parameters
 17    ----------
 18    adata : ad.AnnData
 19        Has keys `'model'`, `'Z_train'`, and `'Z_test'` in `adata.uns`.
 20
 21    metrics : list[str] | None
 22        Which metrics to calculate on the predicted values. Options
 23        are `'AUROC'`, `'Accuracy'`, `'F1-Score'`, `'Precision'`, and 
 24        `'Recall'`. If `None`, all five metrics are calculated.
 25
 26    return_probs : bool
 27        If `True`, will return a dictionary with class probabilities.
 28
 29    Returns
 30    -------
 31    y_pred : np.ndarray
 32        Predicted cell classes.
 33
 34    metrics_dict : dict
 35        Contains `'AUROC'`, `'Accuracy'`, `'F1-Score'`, 
 36        `'Precision'`, and/or `'Recall'` keys depending on metrics 
 37        argument.
 38
 39    probs : dict
 40        If `return_probs` is `True`, will return a dictionary with 
 41        probabilities for each class in `y_test`.
 42
 43    Examples
 44    --------
 45    >>> adata = scmkl.estimate_sigma(adata)
 46    >>> adata = scmkl.calculate_z(adata)
 47    >>> metrics = ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 
 48    ...            'Recall']
 49    >>> adata = scmkl.train_model(adata, metrics = metrics)
 50    >>>
 51    >>> metrics_dict = scmkl.predict(adata)
 52    >>> metrics_dict.keys()
 53    dict_keys(['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])
 54    """
 55    X_test = adata.uns['Z_test']
 56
 57    allowed_mets = ['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall']
 58
 59    # Asserting all input metrics are valid
 60    if metrics is not None:
 61        mets_allowed = [metric in allowed_mets for metric in metrics]
 62        assert all(mets_allowed), ("Unknown metric provided. Must be None, "
 63                                   f"or one or more of {allowed_mets}")
 64
 65    # Capturing class labels
 66    train_idx = adata.uns['train_indices']
 67    classes = np.unique(adata.obs['labels'].iloc[train_idx].to_numpy())
 68
 69    # Sigmoid function to force probabilities into [0,1]
 70    probabilities = 1/(1 + np.exp(-adata.uns['model'].predict(X_test)))
 71
 72    #Convert numerical probabilities into binary phenotype
 73    y_pred = np.array(np.repeat(classes[1], X_test.shape[0]), 
 74                      dtype = 'object')
 75    y_pred[np.round(probabilities, 0).astype(int) == 1] = classes[0]
 76
 77    if not adata.uns['labeled_test']:
 78        if not metrics is None:
 79            print("WARNING: Cannot calculate classification metrics "
 80                  "for unlabeled test data")
 81            metrics = None
 82    else:
 83        y_test = adata.obs['labels'].iloc[adata.uns['test_indices']]
 84        y_test = y_test.to_numpy()
 85        X_test = adata.uns['Z_test']
 86        assert X_test.shape[0] == len(y_test), ("X rows and length of y must "
 87                                                "be equal")
 88
 89        # Group Lasso requires 'continous' y values need to re-descritize it
 90        y = np.zeros((len(y_test)))
 91        y[y_test == classes[0]] = 1
 92
 93        metric_dict = {}
 94
 95        if (metrics is None) and (return_probs == False):
 96            return y_pred
 97        
 98        # Calculate and save metrics given in metrics
 99        p_cl = classes[0]
100        if 'AUROC' in metrics:
101            fpr, tpr, _ = skm.roc_curve(y, probabilities)
102            metric_dict['AUROC'] = skm.auc(fpr, tpr)
103        if 'Accuracy' in metrics:
104            metric_dict['Accuracy'] = np.mean(y_test == y_pred)
105        if 'F1-Score' in metrics:
106            metric_dict['F1-Score'] = skm.f1_score(y_test, y_pred, 
107                                                   pos_label = p_cl)
108        if 'Precision' in metrics:
109            metric_dict['Precision'] = skm.precision_score(y_test, y_pred, 
110                                                           pos_label = p_cl)
111        if 'Recall' in metrics:
112            metric_dict['Recall'] = skm.recall_score(y_test, y_pred, 
113                                                     pos_label = p_cl)
114
115    if return_probs:
116        probs = {classes[0] : probabilities,
117                 classes[1] : 1 - probabilities}
118        if metrics is not None:
119            return y_pred, metric_dict, probs
120        else:
121            return y_pred, probs
122    else:
123        if metrics is not None:
124            return y_pred, metric_dict
125        else:
126            return y_pred

Function to return predicted labels and calculate any of AUROC, Accuracy, F1 Score, Precision, Recall for a classification.

If labeled_test flag in adata is set to False, metrics cannot be computed.

Parameters
  • adata (ad.AnnData): Has keys 'model', 'Z_train', and 'Z_test' in adata.uns.
  • metrics (list[str] | None): Which metrics to calculate on the predicted values. Options are 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and 'Recall'. If None, no metrics are calculated and no metrics dictionary is returned.
  • return_probs (bool): If True, will return a dictionary with class probabilities.
Returns
  • y_pred (np.ndarray): Predicted cell classes.
  • metrics_dict (dict): Contains 'AUROC', 'Accuracy', 'F1-Score', 'Precision', and/or 'Recall' keys depending on metrics argument.
  • probs (dict): If return_probs is True, will return a dictionary with probabilities for each class in y_test.
Examples
>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> metrics = ['AUROC', 'F1-Score', 'Accuracy', 'Precision', 
...            'Recall']
>>> adata = scmkl.train_model(adata, metrics = metrics)
>>>
>>> y_pred, metrics_dict = scmkl.predict(adata, metrics = metrics)
>>> metrics_dict.keys()
dict_keys(['AUROC', 'Accuracy', 'F1-Score', 'Precision', 'Recall'])
def find_selected_groups(adata: anndata._core.anndata.AnnData) -> numpy.ndarray:
def find_selected_groups(adata: ad.AnnData) -> np.ndarray:
    """
    Find feature groups selected by the model during training. A group
    is selected when the L2 norm of the model coefficients belonging to
    it is non-zero.

    Parameters
    ----------
    adata : ad.AnnData
        Has `celer.GroupLasso` object in `adata.uns['model']` and the
        group names as the keys of `adata.uns['group_dict']`. The i-th
        key is matched with the i-th group of the model.

    Returns
    -------
    selected_groups : np.ndarray
        Array containing the names of the groups with nonzero kernel
        weights.

    Notes
    -----
    The model's `groups` parameter is interpreted as follows:

    - a scalar: every group is a contiguous block of that many
      coefficients;
    - a sequence of scalars: contiguous blocks whose sizes are the
      entries of the sequence;
    - a sequence of sequences: each entry lists the coefficient
      indices of one group.

    Examples
    --------
    >>> adata = scmkl.estimate_sigma(adata)
    >>> adata = scmkl.calculate_z(adata)
    >>> adata = scmkl.train_model(adata)
    >>>
    >>> selected_groups = scmkl.find_selected_groups(adata)
    >>> selected_groups
    array(['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLMARK_HYPOXIA'],
          dtype='<U32')
    """
    model = adata.uns['model']
    coefficients = model.coef_
    groups = model.get_params()['groups']
    group_names = np.array(list(adata.uns['group_dict'].keys()))

    scalar_groups = not isinstance(groups, (list, set, np.ndarray, tuple))

    selected_groups = []
    # Running start column, used only when `groups` lists per-group sizes
    contiguous_start = 0

    # Loop over the model weights associated with each group and calculate
    # the L2 norm
    for i, group in enumerate(group_names):
        if scalar_groups:
            # np.arange excludes its stop value, so the stop must be
            # start + size to cover every coefficient of the group
            # (the former `- 1` dropped the last one).
            group_start = i * groups
            group_cols = np.arange(group_start, group_start + groups)
        elif np.ndim(groups[i]) == 0:
            # Flat sequence of sizes: groups are contiguous blocks
            size = int(groups[i])
            group_cols = np.arange(contiguous_start, contiguous_start + size)
            contiguous_start += size
        else:
            # Explicit coefficient indices for this group
            group_cols = groups[i]

        group_norm = np.linalg.norm(coefficients[group_cols])

        # Only include the group if its weights are not all zero
        if group_norm != 0:
            selected_groups.append(group)

    return np.array(selected_groups)

Find feature groups selected by the model during training. If feature weight assigned by the model is non-0, then the group containing that feature is selected.

Parameters
  • adata (ad.AnnData): Has celer.GroupLasso object in adata.uns['model'].
Returns
  • selected_groups (np.ndarray): Array containing the names of the groups with nonzero kernel weights.
Examples
>>> adata = scmkl.estimate_sigma(adata)
>>> adata = scmkl.calculate_z(adata)
>>> adata = scmkl.train_model(adata)
>>>
>>> selected_groups = scmkl.find_selected_groups(adata)
>>> selected_groups
array(['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLMARK_HYPOXIA'],
      dtype='<U32')