scmkl.one_v_rest

  1import numpy as np
  2import pandas as pd
  3import gc
  4
  5from scmkl.run import run
  6from scmkl.estimate_sigma import estimate_sigma
  7from scmkl.calculate_z import calculate_z
  8from scmkl.multimodal_processing import multimodal_processing
  9from scmkl._checks import _check_adatas
 10
 11
 12def _eval_labels(cell_labels : np.ndarray, train_indices : np.ndarray, 
 13                  test_indices : np.ndarray) -> np.ndarray:
 14    '''
 15    Takes an array of multiclass cell labels and returns a unique array 
 16    of cell labels to test for.
 17
 18    Parameters
 19    ----------
 20    cell_labels : np.ndarray
 21        > Cell labels that coorespond to an AnnData object.
 22
 23    train_indices : np.ndarray
 24        > Indices for the training samples in an AnnData object.
 25    
 26    test_indices - np.ndarray
 27        > Indices for the testing samples in an AnnData object.
 28
 29    remove_labels : bool
 30        > If True, models will only be created for cell labels in both
 31        the training and test data, if False, models will be generated
 32        for all cell labels in the training data.
 33
 34    Returns
 35    -------
 36    uniq_labels : np.ndarray
 37        > Returns a numpy array of unique cell labels to be iterated 
 38        through during one versus all setups.
 39    '''
 40    train_uniq_labels = np.unique(cell_labels[train_indices])
 41    test_uniq_labels = np.unique(cell_labels[test_indices])
 42
 43    # Getting only labels in both training and testing sets
 44    uniq_labels = np.intersect1d(train_uniq_labels, test_uniq_labels)
 45
 46    # Ensuring that at least one cell type label between the two data
 47    #   are the same
 48    cl_intersect = np.intersect1d(train_uniq_labels, test_uniq_labels)
 49    assert len(cl_intersect) > 0, ("There are no common labels between cells "
 50                                   "in the training and testing samples")
 51
 52    return uniq_labels
 53
 54
 55def _prob_table(results : dict, alpha):
 56    '''
 57    Takes a results dictionary with class and probabilities keys and 
 58    returns a table of probabilities for each class and the most 
 59    probable class for each cell.
 60
 61    Parameters
 62    ----------
 63    results : dict
 64        > A nested dictionary that contains a dictionary for each class 
 65        containing probabilities for each cell class.
 66
 67    alpha : float
 68        > A float for which model probabilities should be evaluated 
 69        for.
 70
 71    Returns
 72    -------
 73    prob_table : pd.DataFrame
 74        > Each column is a cell class and the elements are the
 75        class probability outputs from the model.
 76
 77    pred_class : list
 78        > The most probable cell classes respective to the training set 
 79        cells. 
 80    '''
 81    prob_table = {class_ : results[class_]['Probabilities'][alpha][class_]
 82                  for class_ in results.keys()}
 83    prob_table = pd.DataFrame(prob_table)
 84
 85    pred_class = []
 86    maxes = []
 87
 88    for i, row in prob_table.iterrows():
 89        row_max = np.max(row)
 90        indices = np.where(row == row_max)
 91        prediction = prob_table.columns[indices]
 92
 93        if len(prediction) > 1:
 94            prediction = " and ".join(prediction)
 95        else:
 96            prediction = prediction[0]
 97
 98        pred_class.append(prediction)
 99        maxes.append(row_max)
100
101    maxes = np.round(maxes, 0)
102    low_conf = np.invert(np.array(maxes, dtype = np.bool_))
103
104    return prob_table, pred_class, low_conf
105
106
def one_v_rest(adatas : list, names : list, alpha_list : np.ndarray, 
              tfidf : list) -> dict:
    '''
    For each cell class, creates model(s) comparing that class to all 
    others using `scmkl.run()`. Only labels in both training and 
    testing will be run. The returned truth labels are taken at the 
    test indices.

    Parameters
    ----------
    **adatas** : *list[AnnData]* 
        > List of AnnData objects created by create_adata()
        where each AnnData is one modality and composed of both 
        training and testing samples. Requires that `'train_indices'`
        and `'test_indices'` are the same across all AnnDatas.

    **names** : *list[str]* 
        > List of string variables that describe each modality
        respective to adatas for labeling.
        
    **alpha_list** : *np.ndarray* | *float*
        > An array of alpha values to create each model with. The 
        smallest value is the one used for the final predictions.

    **tfidf** : *list[bool]* 
        > List where if element i is `True`, adata[i] will be TFIDF 
        normalized.

    Returns
    -------
    **results** : *dict*
    > Contains keys for each cell class with results from cell class
    versus all other samples. See `scmkl.run()` for further details.
    Also contains `'Probability_table'`, `'Predicted_class'`, 
    `'Truth_labels'` and `'Low_confidence'`.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            group_dict = group_dict)
    >>>
    >>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
    ...                           alpha_list = np.array([0.05, 0.1]),
    ...                           tfidf = [False])
    >>>
    >>> results.keys()
    dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])
    '''
    # Formatting checks ensuring all adata elements are 
    # AnnData objects and train/test indices are all the same
    _check_adatas(adatas, check_obs = True, check_uns = True)

    # Extracting train and test indices
    train_indices = adatas[0].uns['train_indices']
    test_indices = adatas[0].uns['test_indices']

    # Checking and capturing cell labels. Passed as a plain array so
    # the indices are applied by position, not by obs index label.
    uniq_labels = _eval_labels(
        cell_labels = np.asarray(adatas[0].obs['labels']), 
        train_indices = train_indices,
        test_indices = test_indices)

    # Calculating Z matrices, method depends on whether there are multiple 
    # adatas (modalities)
    if len(adatas) == 1:
        adata = estimate_sigma(adatas[0], n_features = 200)
        adata = calculate_z(adata, n_features = 5000)
    else:
        adata = multimodal_processing(adatas = adatas, 
                                      names = names, 
                                      tfidf = tfidf, 
                                      z_calculation = True)

    # Dropping the local reference to the input list before modeling
    del adatas
    gc.collect()

    # Initializing for capturing model outputs
    results = {}

    # Capturing cell labels before overwriting. Object dtype keeps the
    # 'other' placeholder from being truncated by a fixed-width string
    # array.
    original_labels = adata.obs['labels'].copy()
    cell_labels = np.array(original_labels, dtype = object)

    try:
        for label in uniq_labels:
            print(f"Comparing {label} to other types", flush = True)
            cur_labels = cell_labels.copy()
            cur_labels[cell_labels != label] = 'other'

            # Replacing cell labels for current cell type vs rest
            adata.obs['labels'] = cur_labels

            # Running scMKL
            results[label] = run(adata, alpha_list, return_probs = True)
    finally:
        # Restoring the multiclass labels so the AnnData is not left
        # holding the last "label vs other" vector
        adata.obs['labels'] = original_labels

    # Getting final predictions from the smallest alpha
    alpha = np.min(alpha_list)
    prob_table, pred_class, low_conf = _prob_table(results, alpha)

    results['Probability_table'] = prob_table
    results['Predicted_class'] = pred_class
    results['Truth_labels'] = cell_labels[adata.uns['test_indices']]
    results['Low_confidence'] = low_conf

    return results
def one_v_rest(adatas : list, names : list, alpha_list : np.ndarray, 
              tfidf : list) -> dict:
    '''
    For each cell class, creates model(s) comparing that class to all 
    others using `scmkl.run()`. Only labels in both training and 
    testing will be run. The returned truth labels are taken at the 
    test indices.

    Parameters
    ----------
    **adatas** : *list[AnnData]* 
        > List of AnnData objects created by create_adata()
        where each AnnData is one modality and composed of both 
        training and testing samples. Requires that `'train_indices'`
        and `'test_indices'` are the same across all AnnDatas.

    **names** : *list[str]* 
        > List of string variables that describe each modality
        respective to adatas for labeling.
        
    **alpha_list** : *np.ndarray* | *float*
        > An array of alpha values to create each model with. The 
        smallest value is the one used for the final predictions.

    **tfidf** : *list[bool]* 
        > List where if element i is `True`, adata[i] will be TFIDF 
        normalized.

    Returns
    -------
    **results** : *dict*
    > Contains keys for each cell class with results from cell class
    versus all other samples. See `scmkl.run()` for further details.
    Also contains `'Probability_table'`, `'Predicted_class'`, 
    `'Truth_labels'` and `'Low_confidence'`.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            group_dict = group_dict)
    >>>
    >>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
    ...                           alpha_list = np.array([0.05, 0.1]),
    ...                           tfidf = [False])
    >>>
    >>> results.keys()
    dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])
    '''
    # Formatting checks ensuring all adata elements are 
    # AnnData objects and train/test indices are all the same
    _check_adatas(adatas, check_obs = True, check_uns = True)

    # Extracting train and test indices
    train_indices = adatas[0].uns['train_indices']
    test_indices = adatas[0].uns['test_indices']

    # Checking and capturing cell labels. Passed as a plain array so
    # the indices are applied by position, not by obs index label.
    uniq_labels = _eval_labels(
        cell_labels = np.asarray(adatas[0].obs['labels']), 
        train_indices = train_indices,
        test_indices = test_indices)

    # Calculating Z matrices, method depends on whether there are multiple 
    # adatas (modalities)
    if len(adatas) == 1:
        adata = estimate_sigma(adatas[0], n_features = 200)
        adata = calculate_z(adata, n_features = 5000)
    else:
        adata = multimodal_processing(adatas = adatas, 
                                      names = names, 
                                      tfidf = tfidf, 
                                      z_calculation = True)

    # Dropping the local reference to the input list before modeling
    del adatas
    gc.collect()

    # Initializing for capturing model outputs
    results = {}

    # Capturing cell labels before overwriting. Object dtype keeps the
    # 'other' placeholder from being truncated by a fixed-width string
    # array.
    original_labels = adata.obs['labels'].copy()
    cell_labels = np.array(original_labels, dtype = object)

    try:
        for label in uniq_labels:
            print(f"Comparing {label} to other types", flush = True)
            cur_labels = cell_labels.copy()
            cur_labels[cell_labels != label] = 'other'

            # Replacing cell labels for current cell type vs rest
            adata.obs['labels'] = cur_labels

            # Running scMKL
            results[label] = run(adata, alpha_list, return_probs = True)
    finally:
        # Restoring the multiclass labels so the AnnData is not left
        # holding the last "label vs other" vector
        adata.obs['labels'] = original_labels

    # Getting final predictions from the smallest alpha
    alpha = np.min(alpha_list)
    prob_table, pred_class, low_conf = _prob_table(results, alpha)

    results['Probability_table'] = prob_table
    results['Predicted_class'] = pred_class
    results['Truth_labels'] = cell_labels[adata.uns['test_indices']]
    results['Low_confidence'] = low_conf

    return results

For each cell class, creates model(s) comparing that class to all others. Then, predicts on the testing data using scmkl.run. Only labels in both training and testing will be run.

Parameters

adatas : list[AnnData]

List of AnnData objects created by create_adata() where each AnnData is one modality and composed of both training and testing samples. Requires that 'train_indices' and 'test_indices' are the same across all AnnDatas.

names : list[str]

List of string variables that describe each modality respective to adatas for labeling.

alpha_list : np.ndarray | float

An array of alpha values to create each model with.

tfidf : list[bool]

List where if element i is True, adata[i] will be TFIDF normalized.

Returns

results : dict

Contains keys for each cell class with results from cell class versus all other samples. See scmkl.run for further details.

Examples

>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>>
>>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
...                           alpha_list = np.array([0.05, 0.1]),
...                           tfidf = [False])
>>>
>>> results.keys()
dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])