scmkl.one_v_rest

  1import numpy as np
  2import pandas as pd
  3from sklearn.metrics import f1_score
  4import gc
  5
  6from scmkl.run import run
  7from scmkl.calculate_z import calculate_z
  8from scmkl.multimodal_processing import multimodal_processing
  9from scmkl._checks import _check_adatas
 10
 11
 12def _eval_labels(cell_labels: np.ndarray, train_indices: np.ndarray, 
 13                  test_indices: np.ndarray) -> np.ndarray:
 14    """
 15    Takes an array of multiclass cell labels and returns a unique array 
 16    of cell labels to test for.
 17
 18    Parameters
 19    ----------
 20    cell_labels : np.ndarray
 21        Cell labels that coorespond to an AnnData object.
 22
 23    train_indices : np.ndarray
 24        Indices for the training samples in an AnnData object.
 25    
 26    test_indices : np.ndarray
 27        Indices for the testing samples in an AnnData object.
 28
 29    remove_labels : bool
 30        If `True`, models will only be created for cell labels in both 
 31        the training and test data, if `False`, models will be generated
 32        for all cell labels in the training data.
 33
 34    Returns
 35    -------
 36    uniq_labels : np.ndarray
 37        Returns a numpy array of unique cell labels to be iterated 
 38        through during one versus all setups.
 39    """
 40    train_uniq_labels = np.unique(cell_labels[train_indices])
 41    test_uniq_labels = np.unique(cell_labels[test_indices])
 42
 43    # Getting only labels in both training and testing sets
 44    uniq_labels = np.intersect1d(train_uniq_labels, test_uniq_labels)
 45
 46    # Ensuring that at least one cell type label between the two data
 47    #   are the same
 48    cl_intersect = np.intersect1d(train_uniq_labels, test_uniq_labels)
 49    assert len(cl_intersect) > 0, ("There are no common labels between cells "
 50                                   "in the training and testing samples")
 51
 52    return uniq_labels
 53
 54
 55def get_prob_table(results : dict, alpha: float):
 56    """
 57    Takes a results dictionary with class and probabilities keys and 
 58    returns a table of probabilities for each class and the most 
 59    probable class for each cell.
 60
 61    Parameters
 62    ----------
 63    results : dict
 64        A nested dictionary that contains a dictionary for each class 
 65        containing probabilities for each cell class.
 66
 67    alpha : float
 68        A float for which model probabilities should be evaluated 
 69        for.
 70
 71    Returns
 72    -------
 73    prob_table : pd.DataFrame
 74        Each column is a cell class and the elements are the
 75        class probability outputs from the model.
 76
 77    pred_class : list[str]
 78        The most probable cell classes respective to the training set 
 79        cells. 
 80
 81    low_conf : list[bool]
 82        A bool list where `True`, sample max probability is less than 
 83        0.5.
 84    """
 85    prob_table = {class_ : results[class_]['Probabilities'][alpha][class_]
 86                  for class_ in results.keys()}
 87    prob_table = pd.DataFrame(prob_table)
 88
 89    pred_class = []
 90    maxes = []
 91
 92    for i, row in prob_table.iterrows():
 93        row_max = np.max(row)
 94        indices = np.where(row == row_max)
 95        prediction = prob_table.columns[indices]
 96
 97        if len(prediction) > 1:
 98            prediction = " and ".join(prediction)
 99        else:
100            prediction = prediction[0]
101
102        pred_class.append(prediction)
103        maxes.append(row_max)
104
105    maxes = np.round(maxes, 0)
106    low_conf = np.invert(np.array(maxes, dtype = np.bool_))
107
108    return prob_table, pred_class, low_conf
109
110
111def per_model_summary(results: dict, uniq_labels: np.ndarray | list | tuple, 
112                      alpha: float) -> pd.DataFrame:
113    """
114    Takes the results dictionary from `scmkl.one_v_rest()` and adds a 
115    summary dataframe show metrics for each model generated from the 
116    runs.
117
118    Parameters
119    ----------
120    results : dict
121        Results from `scmkl.one_v_rest()`.
122
123    uniq_labels : array_like
124        Unique cell classes from the runs.
125
126    alpha : float
127        The alpha for creating the summary from.
128
129    Returns
130    -------
131    summary_df : pd.DataFrame
132        Dataframe with classes on rows and metrics as cols.
133    """
134    # Getting metrics availible in results
135    avail_mets = list(results[uniq_labels[0]]['Metrics'][alpha])
136
137    summary_df = {metric : list()
138                  for metric in avail_mets}
139    summary_df['Class'] = uniq_labels
140
141    for lab in summary_df['Class']:
142        for met in avail_mets:
143            val = results[lab]['Metrics'][alpha][met]
144            summary_df[met].append(val)
145
146    return pd.DataFrame(summary_df)
147
148
149def get_class_train(train_indices: np.ndarray,
150                    cell_labels: np.ndarray | list | pd.Series,
151                    seed_obj: np.random._generator.Generator,
152                    other_factor = 1.5):
153    """
154    This function returns a dict with each entry being a set of 
155    training indices for each cell class to be used in 
156    `scmkl.one_v_rest()`.
157
158    Parameters
159    ----------
160    train_indices : np.ndarray
161        The indices in the `ad.AnnData` object of samples availible to 
162        train on.
163
164    cell_labels : array_like
165        The identity of all cells in the anndata object.
166
167    seed_obj : np.random._generator.Generator
168        The seed object used to randomly sample non-target samples.
169
170    other_factor : float
171        The ratio of cells to sample for the other class for each 
172        model. For example, if classifying B cells with 100 B cells in 
173        training, if `other_factor=1`, 100 cells that are not B cells 
174        will be trained on with the B cells.
175
176    Returns
177    -------
178    train_idx : dict
179        Keys are cell classes and values are the train indices to 
180        train scmkl that include both target and non-target samples.
181    """
182    uniq_labels = set(cell_labels)
183    train_idx = dict()
184
185    for lab in uniq_labels:
186        target_pos = np.where(lab == cell_labels[train_indices])[0]
187        overlap = np.isin(target_pos, train_indices)
188
189        target_pos = target_pos[overlap]
190        other_pos = np.setdiff1d(train_indices, target_pos)
191
192        if (other_factor*target_pos.shape[0]) <= other_pos.shape[0]:
193            n_samples = int(other_factor*target_pos.shape[0])
194        else:
195            n_samples = other_pos.shape[0]
196
197        other_pos = seed_obj.choice(other_pos, n_samples, False)
198
199        lab_train = np.concatenate([target_pos, other_pos])
200        train_idx[lab] = lab_train.copy()
201
202    return train_idx
203
204
def one_v_rest(adatas : list, names : list, alpha_list : np.ndarray, 
              tfidf : list, batches: int=10, batch_size: int=100, 
              force_balance: bool=False, other_factor: float=1.0) -> dict:
    """
    For each cell class, creates model(s) comparing that class to all 
    others. Then, predicts on the testing data using `scmkl.run()`.
    Only labels in both training and testing will be run.

    Parameters
    ----------
    adatas : list[AnnData]
        List of `ad.AnnData` objects created by `create_adata()` 
        where each `ad.AnnData` is one modality and composed of both 
        training and testing samples. Requires that `'train_indices'`
        and `'test_indices'` are the same between all `ad.AnnData`s.

    names : list[str]
        String variables that describe each modality respective to 
        `adatas` for labeling.
        
    alpha_list : np.ndarray | float
        An array of alpha values to create each model with or a float 
        to run with a single alpha. Final predictions are reported for 
        the smallest alpha.

    tfidf : list[bool]
        If element `i` is `True`, `adatas[i]` will be TF-IDF 
        normalized.

    batches : int
        The number of batches to use for the distance calculation. 
        This will average the result of `batches` distance calculations 
        of `batch_size` randomly sampled cells. More batches will 
        converge to population distance values at the cost of 
        scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. Higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`,
        `batch_size` will be reduced to 
        `int(num_training_cells / batches)`.

    force_balance : bool
        If `True`, training sets will be balanced to reduce class label 
        imbalance. Defaults to `False`.

    other_factor : float
        The ratio of cells to sample for the other class for each 
        model. For example, if classifying B cells with 100 B cells in 
        training, if `other_factor=1`, 100 cells that are not B cells 
        will be trained on with the B cells.

    Returns
    -------
    results : dict
        Contains keys for each cell class with results from cell class
        versus all other samples. See `scmkl.run()` for further details. 
        Also includes `'Per_model_summary'`, `'Classes'`, 
        `'Probability_table'`, `'Predicted_class'`, `'Truth_labels'`, 
        `'Low_confidence'`, `'Macro_F1-Score'` and, if `force_balance` 
        is `True`, `'Training_indices'`.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            group_dict = group_dict)
    >>>
    >>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
    ...                           alpha_list = np.array([0.05, 0.1]),
    ...                           tfidf = [False])
    >>>
    >>> results.keys()
    dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])
    """
    # Formatting checks ensuring all adata elements are 
    # AnnData objects and train/test indices are all the same
    _check_adatas(adatas, check_obs = True, check_uns = True)

    # Extracting train and test indices
    train_indices = adatas[0].uns['train_indices']
    test_indices = adatas[0].uns['test_indices']

    # Checking and capturing cell labels
    uniq_labels = _eval_labels(cell_labels = adatas[0].obs['labels'], 
                               train_indices = train_indices,
                               test_indices = test_indices)

    # Calculating Z matrices, method depends on whether there are multiple 
    # adatas (modalities)
    if (len(adatas) == 1) and ('Z_train' not in adatas[0].uns.keys()):
        # The single modality is `adatas[0]`; `adata` is not yet bound
        # NOTE(review): if `calculate_z` returns its input object, the 
        # label overwrite below also changes the caller's AnnData - 
        # confirm
        adata = calculate_z(adatas[0], n_features = 5000, 
                            batches = batches, batch_size = batch_size)
    elif len(adatas) > 1:
        adata = multimodal_processing(adatas = adatas, 
                                      names = names, 
                                      tfidf = tfidf,
                                      batches = batches,
                                      batch_size = batch_size)
    else:
        adata = adatas[0].copy()

    del adatas
    gc.collect()

    # Initializing for capturing model outputs
    results = dict()

    # Capturing cell labels before overwriting
    cell_labels = np.array(adata.obs['labels'].copy())

    # Capturing the balanced training indices for each class
    if force_balance:
        train_idx = get_class_train(adata.uns['train_indices'], 
                                    cell_labels, 
                                    adata.uns['seed_obj'],
                                    other_factor)

    for label in uniq_labels:

        print(f"Comparing {label} to other types", flush = True)

        # Object dtype so 'other' is never truncated to the width of a 
        # fixed-length string array; `astype` returns a copy
        cur_labels = cell_labels.astype(object)
        cur_labels[cell_labels != label] = 'other'

        # Replacing cell labels for current cell type vs rest
        adata.obs['labels'] = cur_labels

        if force_balance:
            adata.uns['train_indices'] = train_idx[label]

        # Running scMKL
        results[label] = run(adata, alpha_list, return_probs = True)

    # Getting final predictions, must run before the summary keys are 
    # added as every key in `results` is treated as a class
    alpha = np.min(alpha_list)
    prob_table, pred_class, low_conf = get_prob_table(results, alpha)

    truth_labels = cell_labels[adata.uns['test_indices']]
    macro_f1 = f1_score(truth_labels, pred_class, average='macro')

    model_summary = per_model_summary(results, uniq_labels, alpha)

    results['Per_model_summary'] = model_summary
    results['Classes'] = uniq_labels
    results['Probability_table'] = prob_table
    results['Predicted_class'] = pred_class
    results['Truth_labels'] = truth_labels
    results['Low_confidence'] = low_conf
    results['Macro_F1-Score'] = macro_f1

    if force_balance:
        results['Training_indices'] = train_idx

    return results
def get_prob_table(results: dict, alpha: float):
 56def get_prob_table(results : dict, alpha: float):
 57    """
 58    Takes a results dictionary with class and probabilities keys and 
 59    returns a table of probabilities for each class and the most 
 60    probable class for each cell.
 61
 62    Parameters
 63    ----------
 64    results : dict
 65        A nested dictionary that contains a dictionary for each class 
 66        containing probabilities for each cell class.
 67
 68    alpha : float
 69        A float for which model probabilities should be evaluated 
 70        for.
 71
 72    Returns
 73    -------
 74    prob_table : pd.DataFrame
 75        Each column is a cell class and the elements are the
 76        class probability outputs from the model.
 77
 78    pred_class : list[str]
 79        The most probable cell classes respective to the training set 
 80        cells. 
 81
 82    low_conf : list[bool]
 83        A bool list where `True`, sample max probability is less than 
 84        0.5.
 85    """
 86    prob_table = {class_ : results[class_]['Probabilities'][alpha][class_]
 87                  for class_ in results.keys()}
 88    prob_table = pd.DataFrame(prob_table)
 89
 90    pred_class = []
 91    maxes = []
 92
 93    for i, row in prob_table.iterrows():
 94        row_max = np.max(row)
 95        indices = np.where(row == row_max)
 96        prediction = prob_table.columns[indices]
 97
 98        if len(prediction) > 1:
 99            prediction = " and ".join(prediction)
100        else:
101            prediction = prediction[0]
102
103        pred_class.append(prediction)
104        maxes.append(row_max)
105
106    maxes = np.round(maxes, 0)
107    low_conf = np.invert(np.array(maxes, dtype = np.bool_))
108
109    return prob_table, pred_class, low_conf

Takes a results dictionary with class and probabilities keys and returns a table of probabilities for each class and the most probable class for each cell.

Parameters
  • results (dict): A nested dictionary that contains a dictionary for each class containing probabilities for each cell class.
  • alpha (float): A float for which model probabilities should be evaluated for.
Returns
  • prob_table (pd.DataFrame): Each column is a cell class and the elements are the class probability outputs from the model.
  • pred_class (list[str]): The most probable cell class for each cell (row) of the probability table; classes tied for the maximum probability are joined with " and ".
  • low_conf (list[bool]): A bool list where True, sample max probability is less than 0.5.
def per_model_summary( results: dict, uniq_labels: numpy.ndarray | list | tuple, alpha: float) -> pandas.core.frame.DataFrame:
112def per_model_summary(results: dict, uniq_labels: np.ndarray | list | tuple, 
113                      alpha: float) -> pd.DataFrame:
114    """
115    Takes the results dictionary from `scmkl.one_v_rest()` and adds a 
116    summary dataframe show metrics for each model generated from the 
117    runs.
118
119    Parameters
120    ----------
121    results : dict
122        Results from `scmkl.one_v_rest()`.
123
124    uniq_labels : array_like
125        Unique cell classes from the runs.
126
127    alpha : float
128        The alpha for creating the summary from.
129
130    Returns
131    -------
132    summary_df : pd.DataFrame
133        Dataframe with classes on rows and metrics as cols.
134    """
135    # Getting metrics availible in results
136    avail_mets = list(results[uniq_labels[0]]['Metrics'][alpha])
137
138    summary_df = {metric : list()
139                  for metric in avail_mets}
140    summary_df['Class'] = uniq_labels
141
142    for lab in summary_df['Class']:
143        for met in avail_mets:
144            val = results[lab]['Metrics'][alpha][met]
145            summary_df[met].append(val)
146
147    return pd.DataFrame(summary_df)

Takes the results dictionary from scmkl.one_v_rest and returns a summary dataframe showing the metrics for each model generated from the runs.

Parameters
  • results (dict): Results from scmkl.one_v_rest.
  • uniq_labels (array_like): Unique cell classes from the runs.
  • alpha (float): The alpha for creating the summary from.
Returns
  • summary_df (pd.DataFrame): Dataframe with classes on rows and metrics as cols.
def get_class_train( train_indices: numpy.ndarray, cell_labels: numpy.ndarray | list | pandas.core.series.Series, seed_obj: numpy.random._generator.Generator, other_factor=1.5):
150def get_class_train(train_indices: np.ndarray,
151                    cell_labels: np.ndarray | list | pd.Series,
152                    seed_obj: np.random._generator.Generator,
153                    other_factor = 1.5):
154    """
155    This function returns a dict with each entry being a set of 
156    training indices for each cell class to be used in 
157    `scmkl.one_v_rest()`.
158
159    Parameters
160    ----------
161    train_indices : np.ndarray
162        The indices in the `ad.AnnData` object of samples availible to 
163        train on.
164
165    cell_labels : array_like
166        The identity of all cells in the anndata object.
167
168    seed_obj : np.random._generator.Generator
169        The seed object used to randomly sample non-target samples.
170
171    other_factor : float
172        The ratio of cells to sample for the other class for each 
173        model. For example, if classifying B cells with 100 B cells in 
174        training, if `other_factor=1`, 100 cells that are not B cells 
175        will be trained on with the B cells.
176
177    Returns
178    -------
179    train_idx : dict
180        Keys are cell classes and values are the train indices to 
181        train scmkl that include both target and non-target samples.
182    """
183    uniq_labels = set(cell_labels)
184    train_idx = dict()
185
186    for lab in uniq_labels:
187        target_pos = np.where(lab == cell_labels[train_indices])[0]
188        overlap = np.isin(target_pos, train_indices)
189
190        target_pos = target_pos[overlap]
191        other_pos = np.setdiff1d(train_indices, target_pos)
192
193        if (other_factor*target_pos.shape[0]) <= other_pos.shape[0]:
194            n_samples = int(other_factor*target_pos.shape[0])
195        else:
196            n_samples = other_pos.shape[0]
197
198        other_pos = seed_obj.choice(other_pos, n_samples, False)
199
200        lab_train = np.concatenate([target_pos, other_pos])
201        train_idx[lab] = lab_train.copy()
202
203    return train_idx

This function returns a dict with each entry being a set of training indices for each cell class to be used in scmkl.one_v_rest.

Parameters
  • train_indices (np.ndarray): The indices in the ad.AnnData object of samples available to train on.
  • cell_labels (array_like): The identity of all cells in the anndata object.
  • seed_obj (np.random._generator.Generator): The seed object used to randomly sample non-target samples.
  • other_factor (float): The ratio of cells to sample for the other class for each model. For example, if classifying B cells with 100 B cells in training, if other_factor=1, 100 cells that are not B cells will be trained on with the B cells.
Returns
  • train_idx (dict): Keys are cell classes and values are the train indices to train scmkl that include both target and non-target samples.
def one_v_rest( adatas: list, names: list, alpha_list: numpy.ndarray, tfidf: list, batches: int = 10, batch_size: int = 100, force_balance: bool = False, other_factor: float = 1.0) -> dict:
def one_v_rest(adatas : list, names : list, alpha_list : np.ndarray, 
              tfidf : list, batches: int=10, batch_size: int=100, 
              force_balance: bool=False, other_factor: float=1.0) -> dict:
    """
    For each cell class, creates model(s) comparing that class to all 
    others. Then, predicts on the testing data using `scmkl.run()`.
    Only labels in both training and testing will be run.

    Parameters
    ----------
    adatas : list[AnnData]
        List of `ad.AnnData` objects created by `create_adata()` 
        where each `ad.AnnData` is one modality and composed of both 
        training and testing samples. Requires that `'train_indices'`
        and `'test_indices'` are the same between all `ad.AnnData`s.

    names : list[str]
        String variables that describe each modality respective to 
        `adatas` for labeling.
        
    alpha_list : np.ndarray | float
        An array of alpha values to create each model with or a float 
        to run with a single alpha. Final predictions are reported for 
        the smallest alpha.

    tfidf : list[bool]
        If element `i` is `True`, `adatas[i]` will be TF-IDF 
        normalized.

    batches : int
        The number of batches to use for the distance calculation. 
        This will average the result of `batches` distance calculations 
        of `batch_size` randomly sampled cells. More batches will 
        converge to population distance values at the cost of 
        scalability.

    batch_size : int
        The number of cells to include per batch for distance
        calculations. Higher batch size will converge to population
        distance values at the cost of scalability.
        If `batches*batch_size > num_training_cells`,
        `batch_size` will be reduced to 
        `int(num_training_cells / batches)`.

    force_balance : bool
        If `True`, training sets will be balanced to reduce class label 
        imbalance. Defaults to `False`.

    other_factor : float
        The ratio of cells to sample for the other class for each 
        model. For example, if classifying B cells with 100 B cells in 
        training, if `other_factor=1`, 100 cells that are not B cells 
        will be trained on with the B cells.

    Returns
    -------
    results : dict
        Contains keys for each cell class with results from cell class
        versus all other samples. See `scmkl.run()` for further details. 
        Also includes `'Per_model_summary'`, `'Classes'`, 
        `'Probability_table'`, `'Predicted_class'`, `'Truth_labels'`, 
        `'Low_confidence'`, `'Macro_F1-Score'` and, if `force_balance` 
        is `True`, `'Training_indices'`.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            group_dict = group_dict)
    >>>
    >>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
    ...                           alpha_list = np.array([0.05, 0.1]),
    ...                           tfidf = [False])
    >>>
    >>> results.keys()
    dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])
    """
    # Formatting checks ensuring all adata elements are 
    # AnnData objects and train/test indices are all the same
    _check_adatas(adatas, check_obs = True, check_uns = True)

    # Extracting train and test indices
    train_indices = adatas[0].uns['train_indices']
    test_indices = adatas[0].uns['test_indices']

    # Checking and capturing cell labels
    uniq_labels = _eval_labels(cell_labels = adatas[0].obs['labels'], 
                               train_indices = train_indices,
                               test_indices = test_indices)

    # Calculating Z matrices, method depends on whether there are multiple 
    # adatas (modalities)
    if (len(adatas) == 1) and ('Z_train' not in adatas[0].uns.keys()):
        # The single modality is `adatas[0]`; `adata` is not yet bound
        # NOTE(review): if `calculate_z` returns its input object, the 
        # label overwrite below also changes the caller's AnnData - 
        # confirm
        adata = calculate_z(adatas[0], n_features = 5000, 
                            batches = batches, batch_size = batch_size)
    elif len(adatas) > 1:
        adata = multimodal_processing(adatas = adatas, 
                                      names = names, 
                                      tfidf = tfidf,
                                      batches = batches,
                                      batch_size = batch_size)
    else:
        adata = adatas[0].copy()

    del adatas
    gc.collect()

    # Initializing for capturing model outputs
    results = dict()

    # Capturing cell labels before overwriting
    cell_labels = np.array(adata.obs['labels'].copy())

    # Capturing the balanced training indices for each class
    if force_balance:
        train_idx = get_class_train(adata.uns['train_indices'], 
                                    cell_labels, 
                                    adata.uns['seed_obj'],
                                    other_factor)

    for label in uniq_labels:

        print(f"Comparing {label} to other types", flush = True)

        # Object dtype so 'other' is never truncated to the width of a 
        # fixed-length string array; `astype` returns a copy
        cur_labels = cell_labels.astype(object)
        cur_labels[cell_labels != label] = 'other'

        # Replacing cell labels for current cell type vs rest
        adata.obs['labels'] = cur_labels

        if force_balance:
            adata.uns['train_indices'] = train_idx[label]

        # Running scMKL
        results[label] = run(adata, alpha_list, return_probs = True)

    # Getting final predictions, must run before the summary keys are 
    # added as every key in `results` is treated as a class
    alpha = np.min(alpha_list)
    prob_table, pred_class, low_conf = get_prob_table(results, alpha)

    truth_labels = cell_labels[adata.uns['test_indices']]
    macro_f1 = f1_score(truth_labels, pred_class, average='macro')

    model_summary = per_model_summary(results, uniq_labels, alpha)

    results['Per_model_summary'] = model_summary
    results['Classes'] = uniq_labels
    results['Probability_table'] = prob_table
    results['Predicted_class'] = pred_class
    results['Truth_labels'] = truth_labels
    results['Low_confidence'] = low_conf
    results['Macro_F1-Score'] = macro_f1

    if force_balance:
        results['Training_indices'] = train_idx

    return results

For each cell class, creates model(s) comparing that class to all others. Then, predicts on the testing data using scmkl.run. Only labels in both training and testing will be run.

Parameters
  • adatas (list[AnnData]): List of ad.AnnData objects created by create_adata() where each ad.AnnData is one modality and composed of both training and testing samples. Requires that 'train_indices' and 'test_indices' are the same between all ad.AnnDatas.
  • names (list[str]): String variables that describe each modality respective to adatas for labeling.
  • alpha_list (np.ndarray | float): An array of alpha values to create each model with or a float to run with a single alpha.
  • tfidf (list[bool]): If element i is True, adatas[i] will be TF-IDF normalized.
  • batches (int): The number of batches to use for the distance calculation. This will average the result of batches distance calculations of batch_size randomly sampled cells. More batches will converge to population distance values at the cost of scalability.
  • batch_size (int): The number of cells to include per batch for distance calculations. Higher batch size will converge to population distance values at the cost of scalability. If batches*batch_size > num_training_cells, batch_size will be reduced to int(num_training_cells / batches).
  • force_balance (bool): If True, training sets will be balanced to reduce class label imbalance. Defaults to False.
  • other_factor (float): The ratio of cells to sample for the other class for each model. For example, if classifying B cells with 100 B cells in training, if other_factor=1, 100 cells that are not B cells will be trained on with the B cells.
Returns
  • results (dict): Contains keys for each cell class with results from cell class versus all other samples. See scmkl.run for further details. Will also include a probabilities table with the predictions from each model.
Examples
>>> adata = scmkl.create_adata(X = data_mat, 
...                            feature_names = gene_names, 
...                            group_dict = group_dict)
>>>
>>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
...                           alpha_list = np.array([0.05, 0.1]),
...                           tfidf = [False])
>>>
>>> results.keys()
dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])