scmkl.one_v_rest
import numpy as np
import pandas as pd
import gc

from scmkl.run import run
from scmkl.estimate_sigma import estimate_sigma
from scmkl.calculate_z import calculate_z
from scmkl.multimodal_processing import multimodal_processing
from scmkl._checks import _check_adatas


def _eval_labels(cell_labels : np.ndarray, train_indices : np.ndarray,
                 test_indices : np.ndarray) -> np.ndarray:
    '''
    Takes an array of multiclass cell labels and returns a unique array
    of cell labels to test for.

    Parameters
    ----------
    cell_labels : np.ndarray
    > Cell labels that correspond to an AnnData object.

    train_indices : np.ndarray
    > Indices for the training samples in an AnnData object.

    test_indices : np.ndarray
    > Indices for the testing samples in an AnnData object.

    Returns
    -------
    uniq_labels : np.ndarray
    > Returns a numpy array of the unique cell labels found in both
    the training and testing samples, to be iterated through during
    one versus all setups.

    Raises
    ------
    AssertionError
    > If the training and testing samples share no cell labels.
    '''
    # Coercing to a numpy array so the indices are always applied
    # positionally, even when a pandas Series is passed in
    cell_labels = np.asarray(cell_labels)

    train_uniq_labels = np.unique(cell_labels[train_indices])
    test_uniq_labels = np.unique(cell_labels[test_indices])

    # Getting only labels in both training and testing sets
    uniq_labels = np.intersect1d(train_uniq_labels, test_uniq_labels)

    # Ensuring that at least one cell type label between the two data
    # are the same
    assert len(uniq_labels) > 0, ("There are no common labels between cells "
                                  "in the training and testing samples")

    return uniq_labels


def _prob_table(results : dict, alpha):
    '''
    Takes a results dictionary with class and probabilities keys and
    returns a table of probabilities for each class and the most
    probable class for each cell.

    Parameters
    ----------
    results : dict
    > A nested dictionary that contains a dictionary for each class
    containing probabilities for each cell class. Read as
    `results[class_]['Probabilities'][alpha][class_]`.

    alpha : float
    > A float for which model probabilities should be evaluated
    for.

    Returns
    -------
    prob_table : pd.DataFrame
    > Each column is a cell class and the elements are the
    class probability outputs from the model.

    pred_class : list
    > The most probable cell class for each row of `prob_table`.
    Classes tied for the highest probability are joined
    with `" and "`.

    low_conf : np.ndarray
    > Boolean array that is `True` for each row whose highest class
    probability rounds to 0.
    '''
    prob_table = {class_ : results[class_]['Probabilities'][alpha][class_]
                  for class_ in results}
    prob_table = pd.DataFrame(prob_table)

    pred_class = []
    maxes = []

    for _, row in prob_table.iterrows():
        row_max = np.max(row)

        # Every class that reaches the row maximum (more than one
        # when there is a tie)
        is_max = (row == row_max).to_numpy()
        prediction = prob_table.columns[is_max]

        if len(prediction) > 1:
            prediction = " and ".join(prediction)
        else:
            prediction = prediction[0]

        pred_class.append(prediction)
        maxes.append(row_max)

    # A maximum that rounds to 0 becomes False when cast to bool, so
    # inverting flags those rows as low confidence
    maxes = np.round(maxes, 0)
    low_conf = np.invert(np.array(maxes, dtype = np.bool_))

    return prob_table, pred_class, low_conf


def one_v_rest(adatas : list, names : list, alpha_list : np.ndarray,
               tfidf : list) -> dict:
    '''
    For each cell class, creates model(s) comparing that class to all
    others using `scmkl.run()`. Only labels in both training and
    testing will be run.

    Parameters
    ----------
    **adatas** : *list[AnnData]*
    > List of AnnData objects created by create_adata()
    where each AnnData is one modality and composed of both
    training and testing samples. Requires that `'train_indices'`
    and `'test_indices'` are the same across all AnnDatas.

    **names** : *list[str]*
    > List of string variables that describe each modality
    respective to adatas for labeling. Only used when more than
    one AnnData is given.

    **alpha_list** : *np.ndarray* | *float*
    > An array of alpha values to create each model with. Final
    predictions are taken from the smallest alpha.

    **tfidf** : *list[bool]*
    > List where if element i is `True`, adata[i] will be TFIDF
    normalized. Only used when more than one AnnData is given.

    Returns
    -------
    **results** : *dict*
    > Contains keys for each cell class with results from cell class
    versus all other samples. See `scmkl.run()` for further details.
    Also contains `'Probability_table'`, `'Predicted_class'`,
    `'Truth_labels'` (labels of the samples at `'test_indices'`),
    and `'Low_confidence'`.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            group_dict = group_dict)
    >>>
    >>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
    ...                           alpha_list = np.array([0.05, 0.1]),
    ...                           tfidf = [False])
    >>>
    >>> results.keys()
    dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])
    '''
    # Formatting checks ensuring all adata elements are
    # AnnData objects and train/test indices are all the same
    _check_adatas(adatas, check_obs = True, check_uns = True)

    # Extracting train and test indices
    train_indices = adatas[0].uns['train_indices']
    test_indices = adatas[0].uns['test_indices']

    # Checking and capturing cell labels; passed as a numpy array so
    # that the indices are applied positionally
    uniq_labels = _eval_labels(
        cell_labels = np.asarray(adatas[0].obs['labels']),
        train_indices = train_indices,
        test_indices = test_indices)

    # Calculating Z matrices, method depends on whether there are multiple
    # adatas (modalities)
    if len(adatas) == 1:
        adata = estimate_sigma(adatas[0], n_features = 200)
        adata = calculate_z(adata, n_features = 5000)
    else:
        adata = multimodal_processing(adatas = adatas,
                                      names = names,
                                      tfidf = tfidf,
                                      z_calculation = True)

    del adatas
    gc.collect()

    # Initializing for capturing model outputs
    results = {}

    # Capturing cell labels before overwriting
    original_labels = adata.obs['labels'].copy()
    cell_labels = np.array(original_labels)

    try:
        for label in uniq_labels:
            print(f"Comparing {label} to other types", flush = True)

            # Object dtype copy so that 'other' is never truncated by a
            # fixed-width string dtype or rejected by a numeric dtype
            cur_labels = cell_labels.astype(object)
            cur_labels[cell_labels != label] = 'other'

            # Replacing cell labels for current cell type vs rest
            adata.obs['labels'] = cur_labels

            # Running scMKL
            results[label] = run(adata, alpha_list, return_probs = True)
    finally:
        # Putting the multiclass labels back so adata is not left
        # holding the last one versus rest labelling
        adata.obs['labels'] = original_labels

    # Getting final predictions
    alpha = np.min(alpha_list)
    prob_table, pred_class, low_conf = _prob_table(results, alpha)

    results['Probability_table'] = prob_table
    results['Predicted_class'] = pred_class
    results['Truth_labels'] = cell_labels[adata.uns['test_indices']]
    results['Low_confidence'] = low_conf

    return results
def one_v_rest(adatas : list, names : list, alpha_list : np.ndarray,
               tfidf : list) -> dict:
    '''
    For each cell class, creates model(s) comparing that class to all
    others using `scmkl.run()`. Only labels in both training and
    testing will be run.

    Parameters
    ----------
    **adatas** : *list[AnnData]*
    > List of AnnData objects created by create_adata()
    where each AnnData is one modality and composed of both
    training and testing samples. Requires that `'train_indices'`
    and `'test_indices'` are the same across all AnnDatas.

    **names** : *list[str]*
    > List of string variables that describe each modality
    respective to adatas for labeling. Only used when more than
    one AnnData is given.

    **alpha_list** : *np.ndarray* | *float*
    > An array of alpha values to create each model with. Final
    predictions are taken from the smallest alpha.

    **tfidf** : *list[bool]*
    > List where if element i is `True`, adata[i] will be TFIDF
    normalized. Only used when more than one AnnData is given.

    Returns
    -------
    **results** : *dict*
    > Contains keys for each cell class with results from cell class
    versus all other samples. See `scmkl.run()` for further details.
    Also contains `'Probability_table'`, `'Predicted_class'`,
    `'Truth_labels'` (labels of the samples at `'test_indices'`),
    and `'Low_confidence'`.

    Examples
    --------
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            group_dict = group_dict)
    >>>
    >>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
    ...                           alpha_list = np.array([0.05, 0.1]),
    ...                           tfidf = [False])
    >>>
    >>> results.keys()
    dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])
    '''
    # Formatting checks ensuring all adata elements are
    # AnnData objects and train/test indices are all the same
    _check_adatas(adatas, check_obs = True, check_uns = True)

    # Extracting train and test indices
    train_indices = adatas[0].uns['train_indices']
    test_indices = adatas[0].uns['test_indices']

    # Checking and capturing cell labels; passed as a numpy array so
    # that the indices are applied positionally
    uniq_labels = _eval_labels(
        cell_labels = np.asarray(adatas[0].obs['labels']),
        train_indices = train_indices,
        test_indices = test_indices)

    # Calculating Z matrices, method depends on whether there are multiple
    # adatas (modalities)
    if len(adatas) == 1:
        adata = estimate_sigma(adatas[0], n_features = 200)
        adata = calculate_z(adata, n_features = 5000)
    else:
        adata = multimodal_processing(adatas = adatas,
                                      names = names,
                                      tfidf = tfidf,
                                      z_calculation = True)

    del adatas
    gc.collect()

    # Initializing for capturing model outputs
    results = {}

    # Capturing cell labels before overwriting
    original_labels = adata.obs['labels'].copy()
    cell_labels = np.array(original_labels)

    try:
        for label in uniq_labels:
            print(f"Comparing {label} to other types", flush = True)

            # Object dtype copy so that 'other' is never truncated by a
            # fixed-width string dtype or rejected by a numeric dtype
            cur_labels = cell_labels.astype(object)
            cur_labels[cell_labels != label] = 'other'

            # Replacing cell labels for current cell type vs rest
            adata.obs['labels'] = cur_labels

            # Running scMKL
            results[label] = run(adata, alpha_list, return_probs = True)
    finally:
        # Putting the multiclass labels back so adata is not left
        # holding the last one versus rest labelling
        adata.obs['labels'] = original_labels

    # Getting final predictions
    alpha = np.min(alpha_list)
    prob_table, pred_class, low_conf = _prob_table(results, alpha)

    results['Probability_table'] = prob_table
    results['Predicted_class'] = pred_class
    results['Truth_labels'] = cell_labels[adata.uns['test_indices']]
    results['Low_confidence'] = low_conf

    return results
For each cell class, creates model(s) comparing that class to all
others. Then, predicts on the training data using scmkl.run
.
Only labels in both training and testing will be run.
Parameters
adatas : list[AnnData]
List of AnnData objects created by create_adata() where each AnnData is one modality and composed of both training and testing samples. Requires that
'train_indices'
and 'test_indices'
are the same across all AnnDatas.
names : list[str]
List of string variables that describe each modality respective to adatas for labeling.
alpha_list : np.ndarray | float
An array of alpha values to create each model with.
tfidf : list[bool]
List where if element i is
True
, adata[i] will be TFIDF normalized.
Returns
results : dict
Contains keys for each cell class with results from cell class versus all other samples. See
scmkl.run
for further details.
Examples
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... group_dict = group_dict)
>>>
>>> results = scmkl.one_v_rest(adatas = [adata], names = ['rna'],
... alpha_list = np.array([0.05, 0.1]),
... tfidf = [False])
>>>
>>> results.keys()
dict_keys(['B cells', 'Monocytes', 'Dendritic cells', ...])