scmkl.create_adata
import numpy as np
import anndata as ad


def _filter_features(X, feature_names, group_dict, remove_features):
    '''
    Function to remove unused features from X matrix and to restrict
    every grouping to the features that are present in the data.
    Groupings are sorted and groupings left with fewer than two
    features are dropped.

    Parameters
    ----------
    X : Data array. Can be Numpy array or Scipy Sparse Array
    feature_names : Numpy array of corresponding feature names
    group_dict : Dictionary containing feature grouping information.
                 Example: {geneset: np.array(gene_1, gene_2, ...,
                                             gene_n)}
    remove_features : If True, columns of X (and the matching entries
                      of feature_names) that belong to no grouping are
                      removed.

    Returns
    -------
    X : Data array, subset to grouped features if remove_features
    feature_names : Numpy array of feature names corresponding to the
                    returned X
    group_dict : New dictionary of sorted Numpy arrays holding, for
                 each kept grouping, the features shared with
                 feature_names. The dictionary passed in is left
                 untouched.
    '''
    assert X.shape[1] == len(feature_names), ("Given features do not "
                                              "correspond with features in X")

    feature_set = set(feature_names)
    group_features = set()
    filtered_groups = {}

    for group, members in group_dict.items():
        members = set(members)

        # Every listed feature counts toward the columns kept in X, even
        # when its grouping is dropped below
        group_features.update(members)

        # Features shared between the grouping and the data, sorted so the
        # order is deterministic
        shared = np.sort(np.array(list(feature_set & members)))

        # Only keeping groupings that have at least two features
        if len(shared) > 1:
            filtered_groups[group] = shared

    if remove_features:
        # Find location of desired features in whole feature set.
        # Uniqueness of feature_names is not assumed as duplicated names
        # would otherwise be matched incorrectly.
        g_features = np.array(list(group_features))
        group_feature_indices = np.where(np.isin(feature_names,
                                                 g_features))[0]

        # Subset only the desired features and data
        X = X[:, group_feature_indices]
        feature_names = np.array(list(feature_names))[group_feature_indices]

    return X, feature_names, filtered_groups


def _multi_class_split(y, train_ratio = 0.8, class_threshold = 'median',
                       seed_obj = None):
    '''
    Function for calculating the training and testing cell positions
    for multiclass data sets.

    Parameters
    ----------
    **y** : *np.ndarray* | *pd.Series* | *list*
        > Should be an iterable object corresponding to samples in
        `ad.AnnData` object.

    **train_ratio** : *float*
        > Ratio of number of training samples to entire data set.
        Note: if a threshold is applied, the ratio of training samples
        may decrease depending on class balance and `class_threshold`
        parameter.

    **class_threshold** : *str* | *int*
        > If is type `int`, classes with more training samples than
        class_threshold will be down-sampled to class_threshold. If
        `'median'`, classes are down-sampled to the median number of
        training samples per class.

    **seed_obj** : *numpy.random._generator.Generator* | *None*
        > Generator used to randomly sample and split data. If `None`,
        a new generator seeded with 100 is created for the call.

    Returns
    -------
    **train_indices** : *np.ndarray*
        > Indices for training samples.

    **test_indices** : *np.ndarray*
        > Indices for testing samples.
    '''
    # A generator in the signature would be created once at import and
    # share its state across calls
    if seed_obj is None:
        seed_obj = np.random.default_rng(100)

    # Positional, element-wise comparisons below require an array
    y = np.asarray(y)
    uniq_labels = np.unique(y)

    # Finding indices for each cell class
    class_positions = {class_ : np.where(y == class_)[0]
                       for class_ in uniq_labels}

    # Capturing training indices while maintaining original class proportions
    train_samples = {class_ : seed_obj.choice(positions,
                                              int(len(positions)
                                                  * train_ratio),
                                              replace = False)
                     for class_, positions in class_positions.items()}

    # Capturing testing indices while maintaining original class proportions
    test_samples = {class_ : np.setdiff1d(class_positions[class_],
                                          train_samples[class_])
                    for class_ in class_positions}

    # Applying threshold for samples per class
    if class_threshold == 'median':
        all_train = [idx for class_ in train_samples
                     for idx in train_samples[class_]]
        _, class_counts = np.unique(y[all_train], return_counts = True)
        class_threshold = int(np.median(class_counts))

    for class_ in train_samples:
        if len(train_samples[class_]) > class_threshold:
            # Sample without replacement so that no cell is included in
            # the training set more than once
            train_samples[class_] = seed_obj.choice(train_samples[class_],
                                                    class_threshold,
                                                    replace = False)

    train_indices = np.array([idx for class_ in train_samples
                              for idx in train_samples[class_]])

    test_indices = np.array([idx for class_ in test_samples
                             for idx in test_samples[class_]])

    return train_indices, test_indices


def _binary_split(y, train_indices = None, train_ratio = 0.8,
                  seed_obj = None):
    '''
    Function to calculate training and testing indices for given
    dataset. If train indices are given, it will calculate the test
    indices. If train_indices == None, then it calculates both indices,
    preserving the ratio of each label in y

    Parameters
    ----------
    y : Numpy array of cell labels. Can have any number of classes for
        this function.
    train_indices : Optional array of pre-determined training indices
    train_ratio : decimal value ratio of samples in training set to
                  samples in the whole data set
    seed_obj : Numpy random generator used for random processes. Can be
               specified for reproducibility. If None, a new generator
               seeded with 100 is created for the call.

    Returns
    -------
    train_indices : Array of indices of training cells
    test_indices : Array of indices of testing cells
    '''
    # A generator in the signature would be created once at import and
    # share its state across calls
    if seed_obj is None:
        seed_obj = np.random.default_rng(100)

    # Element-wise comparison with each label requires an array
    y = np.asarray(y)

    # If train indices aren't provided
    if train_indices is None:

        train_indices = []

        for label in np.unique(y):

            # Find index of each unique label
            label_indices = np.where(y == label)[0]

            # Sample these indices according to train ratio
            n = int(len(label_indices) * train_ratio)
            train_indices.extend(seed_obj.choice(label_indices, n,
                                                 replace = False))
    else:
        assert len(train_indices) <= len(y), ("More train indices than there "
                                              "are samples")

    train_indices = np.array(train_indices)

    # Test indices are the indices not in the train_indices
    test_indices = np.setdiff1d(np.arange(len(y)), train_indices,
                                assume_unique = True)

    return train_indices, test_indices


def calculate_d(num_samples : int):
    '''
    This function calculates the optimal number of dimensions for
    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
    information.

    Parameters
    ----------
    **num_samples** : *int*
        > The number of samples in the data set including both training
        and testing sets. Must be at least 2.

    Returns
    -------
    **d** : *int*
        > The optimal number of dimensions to run scMKL with the given
        data set, `int(sqrt(n) * log(log(n)))`, but never less than 1.

    Raises
    ------
    ValueError
        > If `num_samples` is less than 2, where `log(log(n))` is
        undefined.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> d = scmkl.calculate_d(raw_counts.shape[0])
    >>> d
    161
    '''
    if num_samples < 2:
        raise ValueError("num_samples must be at least 2 to calculate D")

    d = int(np.sqrt(num_samples) * np.log(np.log(num_samples)))

    # log(log(n)) is <= 0 for very small n; D must stay a positive integer
    return max(d, 1)


def create_adata(X, feature_names: np.ndarray, cell_labels: np.ndarray,
                 group_dict: dict, scale_data: bool = True,
                 split_data : np.ndarray | None = None, D : int | None = None,
                 remove_features = True, train_ratio = 0.8,
                 distance_metric = 'euclidean', kernel_type = 'Gaussian',
                 random_state : int = 1, allow_multiclass : bool = False,
                 class_threshold : str | int = 'median'):
    '''
    Function to create an AnnData object to carry all relevant
    information going forward.

    Parameters
    ----------
    **X** : *scipy.sparse.csc_matrix* | *np.ndarray* |
            *pd.DataFrame*
        > A data matrix of cells by features (sparse array
        recommended for large datasets).

    **feature_names** : *np.ndarray*
        > Array of feature names corresponding with the features
        in X.

    **cell_labels** : *np.ndarray*
        > A numpy array of cell phenotypes corresponding with
        the cells in X.

    **group_dict** : *dict*
        > Dictionary containing feature grouping information.
            - Example: {geneset: np.array([gene_1, gene_2, ...,
                        gene_n])}

    **scale_data** : *bool*
        > If `True`, data matrix is log transformed and standard
        scaled.

    **split_data** : *None* | *np.ndarray*
        > If *None*, data will be split stratified by cell labels.
        Else, is an array of precalculated train/test split
        corresponding to samples. Can include labels for entire
        dataset to benchmark performance or for only training
        data to classify unknown cell types.
            - Example: np.array(['train', 'test', ..., 'train'])

    **D** : *int*
        > Number of Random Fourier Features used to calculate Z.
        Should be a positive integer. Higher values of D will
        increase classification accuracy at the cost of computation
        time. If set to `None`, will be calculated given number of
        samples.

    **remove_features** : *bool*
        > If `True`, will remove features from X and feature_names
        not in group_dict and remove features from groupings not in
        feature_names.

    **train_ratio** : *float*
        > Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio of training samples may
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    **distance_metric** : *str*
        > The pairwise distance metric used to estimate sigma. Must
        be one of the options used in scipy.spatial.distance.cdist.

    **kernel_type** : *str*
        > The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    **random_state** : *int*
        > Integer random_state used to set the seed for
        reproducibility.

    **allow_multiclass** : *bool*
        > If `False`, will ensure that cell labels are binary.

    **class_threshold** : *str* | *int*
        > Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number of
        cells per cell class will be the threshold for number of
        samples per class.

    Returns
    -------
    **adata** : *AnnData*
    > *AnnData* with the following attributes and keys:

    > `adata.X` : the data matrix.

    > `adata.var_names` : the feature names corresponding to
    `adata.X`.

    > `adata.obs['labels']` : cell classes/phenotypes from
    `cell_labels`.

    > `adata.uns['train_indices']` : Indices for training data.

    > `adata.uns['test_indices']` : Indices for testing data.

    > `adata.uns['group_dict']` : Grouping information.

    > `adata.uns['seed_obj']` : Seed object with seed equal to
    100 * `random_state`.

    > `adata.uns['D']` : Number of dimensions to run scMKL with.

    > `adata.uns['scale_data']` : *bool* for whether or not data is log
    transformed and scaled.

    > `adata.uns['distance_metric']` : Distance metric as given.

    > `adata.uns['kernel_type']` : Kernel function as given.

    > `adata.uns['labeled_test']` : *bool*, `False` only when
    `cell_labels` were given for the training cells alone.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> cell_labels = np.load('MCF7_cell_labels.pkl', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl',
    ...                      allow_pickle = True)
    >>>
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            cell_labels = cell_labels,
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
    'distance_metric', 'train_indices', 'test_indices'
    '''

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")

    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    X, feature_names, group_dict = _filter_features(X,
                                                    feature_names,
                                                    group_dict,
                                                    remove_features)

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100 * random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if allow_multiclass:
            split = _multi_class_split(cell_labels,
                                       seed_obj = adata.uns['seed_obj'],
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
        else:
            split = _binary_split(cell_labels,
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)
        train_indices, test_indices = split

        adata.uns['labeled_test'] = True

    else:
        # Element-wise comparison with 'train'/'test' requires an array
        split_data = np.asarray(split_data)
        assert len(split_data) == X.shape[0], ("split_data must contain one "
                                               "entry for each cell in X")

        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        # Labels may cover every cell or only the training cells
        x_eq_labs = X.shape[0] == len(cell_labels)
        train_eq_labs = len(train_indices) == len(cell_labels)
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")

        if train_eq_labs:

            # Only training labels given; fill test cells with a placeholder
            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        else:
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled "
              "To change this behavior, set scale_data to True")

    return adata
def calculate_d(num_samples : int):
    '''
    This function calculates the optimal number of dimensions for
    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
    information.

    Parameters
    ----------
    **num_samples** : *int*
        > The number of samples in the data set including both training
        and testing sets. Must be at least 2.

    Returns
    -------
    **d** : *int*
        > The optimal number of dimensions to run scMKL with the given
        data set, `int(sqrt(n) * log(log(n)))`, but never less than 1.

    Raises
    ------
    ValueError
        > If `num_samples` is less than 2, where `log(log(n))` is
        undefined.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> d = scmkl.calculate_d(raw_counts.shape[0])
    >>> d
    161
    '''
    if num_samples < 2:
        raise ValueError("num_samples must be at least 2 to calculate D")

    d = int(np.sqrt(num_samples) * np.log(np.log(num_samples)))

    # log(log(n)) is <= 0 for very small n; D must stay a positive integer
    return max(d, 1)
This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.
Parameters
num_samples : int
The number of samples in the data set including both training and testing sets.
Returns
d : int
The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>> d = scmkl.calculate_d(raw_counts.shape[0])
>>> d
161
def create_adata(X, feature_names: np.ndarray, cell_labels: np.ndarray,
                 group_dict: dict, scale_data: bool = True,
                 split_data : np.ndarray | None = None, D : int | None = None,
                 remove_features = True, train_ratio = 0.8,
                 distance_metric = 'euclidean', kernel_type = 'Gaussian',
                 random_state : int = 1, allow_multiclass : bool = False,
                 class_threshold : str | int = 'median'):
    '''
    Function to create an AnnData object to carry all relevant
    information going forward.

    Parameters
    ----------
    **X** : *scipy.sparse.csc_matrix* | *np.ndarray* |
            *pd.DataFrame*
        > A data matrix of cells by features (sparse array
        recommended for large datasets).

    **feature_names** : *np.ndarray*
        > Array of feature names corresponding with the features
        in X.

    **cell_labels** : *np.ndarray*
        > A numpy array of cell phenotypes corresponding with
        the cells in X.

    **group_dict** : *dict*
        > Dictionary containing feature grouping information.
            - Example: {geneset: np.array([gene_1, gene_2, ...,
                        gene_n])}

    **scale_data** : *bool*
        > If `True`, data matrix is log transformed and standard
        scaled.

    **split_data** : *None* | *np.ndarray*
        > If *None*, data will be split stratified by cell labels.
        Else, is an array of precalculated train/test split
        corresponding to samples. Can include labels for entire
        dataset to benchmark performance or for only training
        data to classify unknown cell types.
            - Example: np.array(['train', 'test', ..., 'train'])

    **D** : *int*
        > Number of Random Fourier Features used to calculate Z.
        Should be a positive integer. Higher values of D will
        increase classification accuracy at the cost of computation
        time. If set to `None`, will be calculated given number of
        samples.

    **remove_features** : *bool*
        > If `True`, will remove features from X and feature_names
        not in group_dict and remove features from groupings not in
        feature_names.

    **train_ratio** : *float*
        > Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio of training samples may
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    **distance_metric** : *str*
        > The pairwise distance metric used to estimate sigma. Must
        be one of the options used in scipy.spatial.distance.cdist.

    **kernel_type** : *str*
        > The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    **random_state** : *int*
        > Integer random_state used to set the seed for
        reproducibility.

    **allow_multiclass** : *bool*
        > If `False`, will ensure that cell labels are binary.

    **class_threshold** : *str* | *int*
        > Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number of
        cells per cell class will be the threshold for number of
        samples per class.

    Returns
    -------
    **adata** : *AnnData*
    > *AnnData* with the following attributes and keys:

    > `adata.X` : the data matrix.

    > `adata.var_names` : the feature names corresponding to
    `adata.X`.

    > `adata.obs['labels']` : cell classes/phenotypes from
    `cell_labels`.

    > `adata.uns['train_indices']` : Indices for training data.

    > `adata.uns['test_indices']` : Indices for testing data.

    > `adata.uns['group_dict']` : Grouping information.

    > `adata.uns['seed_obj']` : Seed object with seed equal to
    100 * `random_state`.

    > `adata.uns['D']` : Number of dimensions to run scMKL with.

    > `adata.uns['scale_data']` : *bool* for whether or not data is log
    transformed and scaled.

    > `adata.uns['distance_metric']` : Distance metric as given.

    > `adata.uns['kernel_type']` : Kernel function as given.

    > `adata.uns['labeled_test']` : *bool*, `False` only when
    `cell_labels` were given for the training cells alone.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> cell_labels = np.load('MCF7_cell_labels.pkl', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl',
    ...                      allow_pickle = True)
    >>>
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            cell_labels = cell_labels,
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
    'distance_metric', 'train_indices', 'test_indices'
    '''

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")

    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    X, feature_names, group_dict = _filter_features(X,
                                                    feature_names,
                                                    group_dict,
                                                    remove_features)

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100 * random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if allow_multiclass:
            split = _multi_class_split(cell_labels,
                                       seed_obj = adata.uns['seed_obj'],
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
        else:
            split = _binary_split(cell_labels,
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)
        train_indices, test_indices = split

        adata.uns['labeled_test'] = True

    else:
        # Element-wise comparison with 'train'/'test' requires an array
        split_data = np.asarray(split_data)
        assert len(split_data) == X.shape[0], ("split_data must contain one "
                                               "entry for each cell in X")

        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        # Labels may cover every cell or only the training cells
        x_eq_labs = X.shape[0] == len(cell_labels)
        train_eq_labs = len(train_indices) == len(cell_labels)
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")

        if train_eq_labs:

            # Only training labels given; fill test cells with a placeholder
            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        else:
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled "
              "To change this behavior, set scale_data to True")

    return adata
Function to create an AnnData object to carry all relevant information going forward.
Parameters
X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
A data matrix of cells by features (sparse array recommended for large datasets).
feature_names : np.ndarray
array of feature names corresponding with the features in X.
cell_labels : np.ndarray
A numpy array of cell phenotypes corresponding with the cells in X.
group_dict : dict
Dictionary containing feature grouping information. Example: {geneset: np.array([gene_1, gene_2, ..., gene_n])}
scale_data : bool
If
True
, data matrix is log transformed and standard scaled.
split_data : None | np.ndarray
If None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types. Example: np.array(['train', 'test', ..., 'train'])
D : int
Number of Random Fourier Features used to calculate Z. Should be a positive integer. Higher values of D will increase classification accuracy at the cost of computation time. If set to
None
, will be calculated given number of samples.
remove_features : bool
If
True
, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names.
train_ratio : float
Ratio of number of training samples to entire data set. Note: if a threshold is applied, the ratio of training samples may decrease depending on class balance and
class_threshold
parameter if allow_multiclass = True
.
distance_metric : str
The pairwise distance metric used to estimate sigma. Must be one of the options used in scipy.spatial.distance.cdist.
kernel_type : str
The approximated kernel function used to calculate Zs. Must be one of
'Gaussian'
, 'Laplacian'
, or 'Cauchy'
.
random_state : int
Integer random_state used to set the seed for reproducibility.
allow_multiclass : bool
If
False
, will ensure that cell labels are binary.
class_threshold : str | int
Number of samples allowed in the training data for each cell class in the training data. If
'median'
, the median number of cells per cell class will be the threshold for number of samples per class.
Returns
adata : AnnData
AnnData with the following attributes and keys:
adata.X
: the data matrix.
adata.var_names
: the feature names corresponding to adata.X
.
adata.obs['labels']
: cell classes/phenotypes from cell_labels
.
adata.uns['train_indices']
: Indices for training data.
adata.uns['test_indices']
: Indices for testing data.
adata.uns['group_dict']
: Grouping information.
adata.uns['seed_obj']
: Seed object with seed equal to 100 * random_state
.
adata.uns['D']
: Number of dimensions to run scMKL with.
adata.uns['scale_data']
: bool for whether or not data is log transformed and scaled.
adata.uns['distance_metric']
: Distance metric as given.
adata.uns['kernel_type']
: Kernel function as given.
Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl',
...                      allow_pickle = True)
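>>> cell_labels = np.load('MCF7_cell_labels.pkl', allow_pickle = True)
>>>
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... cell_labels = cell_labels,
... group_dict = group_dict)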
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'