scmkl.create_adata
import numpy as np
import anndata as ad


def _filter_features(X, feature_names, group_dict, remove_features):
    '''
    Function to remove unused features from the X matrix. Any features
    not included in group_dict will be removed from the matrix. Also
    puts the features in the same relative order (of included
    features).

    Parameters
    ----------
    X : Data array. Can be Numpy array or Scipy Sparse Array
    feature_names : Numpy array of corresponding feature names
    group_dict : Dictionary containing feature grouping information.
                 Example: {geneset: np.array(gene_1, gene_2, ...,
                 gene_n)}
    remove_features : If True, X and feature_names are subset to only
                 the features present in group_dict

    Returns
    -------
    X : Data array containing data only for features in the group_dict
    feature_names : Numpy array of corresponding feature names from
                 group_dict
    '''
    assert X.shape[1] == len(feature_names), ("Given features do not "
                                              "correspond with features in X")

    group_features = set()
    feature_set = set(feature_names)

    # Store all features from the grouping dictionary in one set
    for group in group_dict.keys():
        group_features.update(set(group_dict[group]))

        # Find the intersection between group features and features in
        # the data; convert to np.ndarray and sort to preserve the
        # order of feature names
        group_feats = list(feature_set.intersection(set(group_dict[group])))
        group_dict[group] = np.sort(np.array(group_feats))

    # Only keep groupings that have at least two features
    group_dict = {group : group_dict[group] for group in group_dict.keys()
                  if len(group_dict[group]) > 1}

    if remove_features:
        # Find location of desired features in whole feature set
        g_features = np.array(list(group_features))
        group_feature_indices = np.where(np.in1d(feature_names,
                                                 g_features,
                                                 assume_unique = True))[0]

        # Subset only the desired features and data
        X = X[:, group_feature_indices]
        feature_names = np.array(list(feature_names))[group_feature_indices]

    return X, feature_names, group_dict


def _multi_class_split(y, train_ratio = 0.8, class_threshold = 'median',
                       seed_obj = np.random.default_rng(100)):
    '''
    Function for calculating the training and testing cell positions
    for multiclass data sets.

    Parameters
    ----------
    **y** : *np.ndarray* | *pd.Series* | *list*
    > Should be an iterable object corresponding to samples in an
    `ad.AnnData` object.

    **train_ratio** : *float*
    > Ratio of the number of training samples to the entire data set.
    Note: if a threshold is applied, the ratio of training samples
    may decrease depending on class balance and the `class_threshold`
    parameter.

    **class_threshold** : *str* | *int*
    > If of type `int`, classes with more training samples than
    class_threshold will be downsampled to class_threshold samples.
    If `'median'`, classes will be downsampled to the median number
    of training samples per class.

    **seed_obj** : *numpy.random._generator.Generator*
    > Seed used to randomly sample and split data.

    Returns
    -------
    **train_indices** : *np.ndarray*
    > Indices for training samples.

    **test_indices** : *np.ndarray*
    > Indices for testing samples.
    '''
    uniq_labels = np.unique(y)

    # Finding indices for each cell class
    class_positions = {class_ : np.where(y == class_)[0]
                       for class_ in uniq_labels}

    # Capturing training indices while maintaining original class proportions
    train_samples = {class_ : seed_obj.choice(class_positions[class_],
                                              int(len(class_positions[class_])
                                                  * train_ratio),
                                              replace = False)
                     for class_ in class_positions.keys()}

    # Capturing testing indices while maintaining original class proportions
    test_samples = {class_ : np.setdiff1d(class_positions[class_],
                                          train_samples[class_])
                    for class_ in class_positions.keys()}

    # Applying threshold for number of samples per class
    if class_threshold == 'median':
        all_train = [idx for class_ in train_samples.keys()
                     for idx in train_samples[class_]]
        _, class_threshold = np.unique(y[all_train], return_counts = True)
        class_threshold = int(np.median(class_threshold))

    for class_ in train_samples.keys():
        if len(train_samples[class_]) > class_threshold:
            # Downsampling without replacement so no index is duplicated
            train_samples[class_] = seed_obj.choice(train_samples[class_],
                                                    class_threshold,
                                                    replace = False)

    train_indices = np.array([idx for class_ in train_samples.keys()
                              for idx in train_samples[class_]])

    test_indices = np.array([idx for class_ in test_samples.keys()
                             for idx in test_samples[class_]])

    return train_indices, test_indices


def _binary_split(y, train_indices = None, train_ratio = 0.8,
                  seed_obj = np.random.default_rng(100)):
    '''
    Function to calculate training and testing indices for a given
    data set. If train_indices is given, it will calculate only the
    test indices. If train_indices == None, it calculates both sets
    of indices, preserving the ratio of each label in y.

    Parameters
    ----------
    y : Numpy array of cell labels. Can have any number of classes for
        this function.
    train_indices : Optional array of pre-determined training indices
    train_ratio : Ratio of the number of training samples to the
        entire data set
    seed_obj : Numpy random generator used for random processes. Can
        be specified for reproducibility or set by default.

    Returns
    -------
    train_indices : Array of indices of training cells
    test_indices : Array of indices of testing cells
    '''

    # If train indices aren't provided
    if train_indices is None:

        unique_labels = np.unique(y)
        train_indices = []

        for label in unique_labels:

            # Find the indices of each unique label
            label_indices = np.where(y == label)[0]

            # Sample these indices according to train_ratio
            n = int(len(label_indices) * train_ratio)
            train_label_indices = seed_obj.choice(label_indices, n,
                                                  replace = False)
            train_indices.extend(train_label_indices)
    else:
        assert len(train_indices) <= len(y), ("More train indices than there "
                                              "are samples")

    train_indices = np.array(train_indices)

    # Test indices are the indices not in train_indices
    test_indices = np.setdiff1d(np.arange(len(y)), train_indices,
                                assume_unique = True)

    return train_indices, test_indices


def calculate_d(num_samples : int):
    '''
    This function calculates the optimal number of dimensions for
    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
    information.

    Parameters
    ----------
    **num_samples** : *int*
    > The number of samples in the data set including both training
    and testing sets.

    Returns
    -------
    **d** : *int*
    > The optimal number of dimensions to run scMKL with the given
    data set.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> d = scmkl.calculate_d(raw_counts.shape[0])
    >>> d
    161
    '''
    d = int(np.sqrt(num_samples) * np.log(np.log(num_samples)))
    return d


def create_adata(X, feature_names: np.ndarray, cell_labels: np.ndarray,
                 group_dict: dict, scale_data: bool = True,
                 split_data : np.ndarray | None = None,
                 D : int | None = None, remove_features = True,
                 train_ratio = 0.8, distance_metric = 'euclidean',
                 kernel_type = 'Gaussian', random_state : int = 1,
                 allow_multiclass : bool = False,
                 class_threshold : str | int = 'median',
                 reduction: str | None = None, tfidf: bool = False):
    '''
    Function to create an AnnData object to carry all relevant
    information going forward.

    Parameters
    ----------
    **X** : *scipy.sparse.csc_matrix* | *np.ndarray* | *pd.DataFrame*
    > A data matrix of cells by features (sparse array recommended
    for large datasets).

    **feature_names** : *np.ndarray*
    > Array of feature names corresponding with the features in X.

    **cell_labels** : *np.ndarray*
    > A numpy array of cell phenotypes corresponding with the cells
    in X.

    **group_dict** : *dict*
    > Dictionary containing feature grouping information.
        - Example: {geneset: np.array([gene_1, gene_2, ..., gene_n])}

    **scale_data** : *bool*
    > If `True`, data matrix is log transformed and standard scaled.

    **split_data** : *None* | *np.ndarray*
    > If *None*, data will be split stratified by cell labels. Else,
    is an array of a precalculated train/test split corresponding to
    samples. Can include labels for the entire dataset to benchmark
    performance, or for only the training data to classify unknown
    cell types.
        - Example: np.array(['train', 'test', ..., 'train'])

    **D** : *int*
    > Number of Random Fourier Features used to calculate Z. Should
    be a positive integer. Higher values of D will increase
    classification accuracy at the cost of computation time. If set
    to `None`, will be calculated from the number of samples.

    **remove_features** : *bool*
    > If `True`, will remove features from X and feature_names not
    in group_dict and remove features from groupings not in
    feature_names.

    **train_ratio** : *float*
    > Ratio of the number of training samples to the entire data
    set. Note: if a threshold is applied and
    `allow_multiclass = True`, the ratio of training samples may
    decrease depending on class balance and the `class_threshold`
    parameter.

    **distance_metric** : *str*
    > The pairwise distance metric used to estimate sigma. Must be
    one of the options used in scipy.spatial.distance.cdist.

    **kernel_type** : *str*
    > The approximated kernel function used to calculate Zs. Must be
    one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    **random_state** : *int*
    > Integer random_state used to set the seed for reproducibility.

    **allow_multiclass** : *bool*
    > If `False`, will ensure that cell labels are binary.

    **class_threshold** : *str* | *int*
    > Number of samples allowed in the training data for each cell
    class in the training data. If `'median'`, the median number of
    cells per cell class will be the threshold for number of samples
    per class.

    **reduction** : *str* | *None*
    > Which dimension reduction technique to perform on features
    within a group. `'svd'` will run
    sklearn.decomposition.TruncatedSVD; `'linear'` will multiply by
    an array of 1s down to 50 dimensions.

    **tfidf** : *bool*
    > Whether to calculate a TF-IDF transformation on peaks within
    groupings.

    Returns
    -------
    **adata** : *AnnData*
    > *AnnData* with the following attributes and keys:

    > `adata.X` : the data matrix.

    > `adata.var_names` : the feature names corresponding to
    `adata.X`.

    > `adata.obs['labels']` : cell classes/phenotypes from
    `cell_labels`.

    > `adata.uns['train_indices']` : Indices for training data.

    > `adata.uns['test_indices']` : Indices for testing data.

    > `adata.uns['group_dict']` : Grouping information.

    > `adata.uns['seed_obj']` : Seed object with seed equal to
    100 * `random_state`.

    > `adata.uns['D']` : Number of dimensions to run scMKL with.

    > `adata.uns['scale_data']` : *bool* for whether or not data is
    log transformed and scaled.

    > `adata.uns['distance_metric']` : Distance metric as given.

    > `adata.uns['kernel_type']` : Kernel function as given.

    > `adata.uns['reduction']` : Dimension reduction technique as
    given (`'None'` if not provided).

    > `adata.uns['tfidf']` : *bool* for whether to calculate TF-IDF
    per grouping.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl',
    ...                      allow_pickle = True)
    >>> labels = np.load('MCF7_cell_labels.npy',  # hypothetical labels file
    ...                  allow_pickle = True)
    >>>
    >>> adata = scmkl.create_adata(X = data_mat,
    ...                            feature_names = gene_names,
    ...                            cell_labels = labels,
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
        obs: 'labels'
        uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
        'distance_metric', 'reduction', 'tfidf', 'labeled_test',
        'train_indices', 'test_indices'
    '''

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")

    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    X, feature_names, group_dict = _filter_features(X,
                                                    feature_names,
                                                    group_dict,
                                                    remove_features)

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100 * random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric
    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
    adata.uns['tfidf'] = tfidf

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if not allow_multiclass:
            split = _binary_split(cell_labels,
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)
            train_indices, test_indices = split

        else:
            split = _multi_class_split(cell_labels,
                                       seed_obj = adata.uns['seed_obj'],
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
            train_indices, test_indices = split

        adata.uns['labeled_test'] = True

    else:
        # Labels must be given for either every cell or only the
        # training cells
        x_eq_labs = X.shape[0] == len(cell_labels)
        train_eq_labs = np.sum(split_data == 'train') == len(cell_labels)
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")

        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        if len(cell_labels) == len(train_indices):

            # Pad labels for unlabeled test cells
            padded_cell_labels = np.zeros(X.shape[0]).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        elif len(cell_labels) == len(split_data):
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled. "
              "To change this behavior, set scale_data to True.")

    return adata
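The stratified split used when no precomputed split is supplied mirrors the logic of _binary_split above; a minimal sketch with synthetic labels (the label values and counts are assumptions for illustration):

>>> import numpy as np
>>> y = np.array(['case'] * 30 + ['control'] * 70)  # assumed toy labels
>>> rng = np.random.default_rng(100)
>>> train = np.concatenate([rng.choice(np.where(y == label)[0],
...                                    int(0.8 * np.sum(y == label)),
...                                    replace = False)
...                         for label in np.unique(y)])
>>> test = np.setdiff1d(np.arange(len(y)), train, assume_unique = True)
>>> len(train), len(test)
(80, 20)

Each class keeps the same 80/20 proportion, so the split preserves class balance: 24 of 30 'case' cells and 56 of 70 'control' cells land in training.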
def calculate_d(num_samples : int):
This function calculates the optimal number of dimensions for
performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
information.

Parameters
----------
num_samples : int
    The number of samples in the data set including both training and
    testing sets.

Returns
-------
d : int
    The optimal number of dimensions to run scMKL with the given data
    set.

Examples
--------
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>> d = scmkl.calculate_d(raw_counts.shape[0])
>>> d
161
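For intuition, the heuristic d = sqrt(n) * log(log(n)) can also be evaluated by hand; a sketch for a hypothetical data set of 5,000 cells (the sample count here is an assumption for illustration):

>>> import numpy as np
>>> n = 5000  # hypothetical number of cells
>>> int(np.sqrt(n) * np.log(np.log(n)))
151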
def create_adata(X, feature_names: np.ndarray, cell_labels: np.ndarray,
                 group_dict: dict, scale_data: bool = True,
                 split_data : np.ndarray | None = None,
                 D : int | None = None, remove_features = True,
                 train_ratio = 0.8, distance_metric = 'euclidean',
                 kernel_type = 'Gaussian', random_state : int = 1,
                 allow_multiclass : bool = False,
                 class_threshold : str | int = 'median',
                 reduction: str | None = None, tfidf: bool = False):
Function to create an AnnData object to carry all relevant
information going forward.

Parameters
----------
X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
    A data matrix of cells by features (sparse array recommended for
    large datasets).
feature_names : np.ndarray
    Array of feature names corresponding with the features in X.
cell_labels : np.ndarray
    A numpy array of cell phenotypes corresponding with the cells in X.
group_dict : dict
    Dictionary containing feature grouping information.
    Example: {geneset: np.array([gene_1, gene_2, ..., gene_n])}
scale_data : bool
    If True, data matrix is log transformed and standard scaled.
split_data : None | np.ndarray
    If None, data will be split stratified by cell labels. Else, is an
    array of a precalculated train/test split corresponding to samples
    (see the sketch after this parameter list). Can include labels for
    the entire dataset to benchmark performance, or for only the
    training data to classify unknown cell types.
    Example: np.array(['train', 'test', ..., 'train'])
D : int
    Number of Random Fourier Features used to calculate Z. Should be a
    positive integer. Higher values of D will increase classification
    accuracy at the cost of computation time. If set to None, will be
    calculated from the number of samples.
remove_features : bool
    If True, will remove features from X and feature_names not in
    group_dict and remove features from groupings not in feature_names.
train_ratio : float
    Ratio of the number of training samples to the entire data set.
    Note: if a threshold is applied and allow_multiclass = True, the
    ratio of training samples may decrease depending on class balance
    and the class_threshold parameter.
distance_metric : str
    The pairwise distance metric used to estimate sigma. Must be one of
    the options used in scipy.spatial.distance.cdist.
kernel_type : str
    The approximated kernel function used to calculate Zs. Must be one
    of 'Gaussian', 'Laplacian', or 'Cauchy'.
random_state : int
    Integer random_state used to set the seed for reproducibility.
allow_multiclass : bool
    If False, will ensure that cell labels are binary.
class_threshold : str | int
    Number of samples allowed in the training data for each cell class
    in the training data. If 'median', the median number of cells per
    cell class will be the threshold for number of samples per class.
reduction : str | None
    Which dimension reduction technique to perform on features within a
    group. 'svd' will run sklearn.decomposition.TruncatedSVD; 'linear'
    will multiply by an array of 1s down to 50 dimensions.
tfidf : bool
    Whether to calculate a TF-IDF transformation on peaks within
    groupings.
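For the split_data parameter, a minimal sketch of a precomputed split (the label and split values here are assumptions for illustration); only the three training cells carry labels, so the two test cells are padded internally with a placeholder label:

>>> import numpy as np
>>> split_data = np.array(['train', 'train', 'train', 'test', 'test'])
>>> cell_labels = np.array(['case', 'control', 'case'])  # training cells only

With this input, adata.uns['labeled_test'] is set to False, since the test cells have no labels to benchmark against.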
Returns
-------
adata : AnnData
    AnnData with the following attributes and keys:

    adata.X : the data matrix.

    adata.var_names : the feature names corresponding to adata.X.

    adata.obs['labels'] : cell classes/phenotypes from cell_labels.

    adata.uns['train_indices'] : Indices for training data.

    adata.uns['test_indices'] : Indices for testing data.

    adata.uns['group_dict'] : Grouping information.

    adata.uns['seed_obj'] : Seed object with seed equal to
    100 * random_state.

    adata.uns['D'] : Number of dimensions to run scMKL with.

    adata.uns['scale_data'] : bool for whether or not data is log
    transformed and scaled.

    adata.uns['distance_metric'] : Distance metric as given.

    adata.uns['kernel_type'] : Kernel function as given.

    adata.uns['reduction'] : Dimension reduction technique as given
    ('None' if not provided).

    adata.uns['tfidf'] : bool for whether to calculate TF-IDF per
    grouping.
Examples
--------
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl', allow_pickle = True)
>>> labels = np.load('MCF7_cell_labels.npy',  # hypothetical labels file
...                  allow_pickle = True)
>>>
>>> adata = scmkl.create_adata(X = data_mat,
...                            feature_names = gene_names,
...                            cell_labels = labels,
...                            group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
    'distance_metric', 'reduction', 'tfidf', 'labeled_test',
    'train_indices', 'test_indices'
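Beyond the file-based example above, a fully self-contained sketch on synthetic data (every name, shape, and grouping below is an assumption for illustration, not part of the scMKL data sets):

>>> import numpy as np
>>> import scipy.sparse
>>> X = scipy.sparse.random(200, 50, density = 0.1, format = 'csc',
...                         random_state = 0)  # 200 cells x 50 features
>>> features = np.array([f'gene_{i}' for i in range(50)])
>>> labels = np.array(['A', 'B'] * 100)        # two balanced classes
>>> group_dict = {'set_1': features[:25], 'set_2': features[25:]}
>>> adata = scmkl.create_adata(X = X, feature_names = features,
...                            cell_labels = labels,
...                            group_dict = group_dict)
>>> len(adata.uns['train_indices']), len(adata.uns['test_indices'])
(160, 40)

With the default train_ratio = 0.8, each class contributes 80 of its 100 cells to training, giving the 160/40 split shown.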