scmkl.create_adata
import gc
import warnings

import anndata as ad
import numpy as np
import pandas as pd
import scipy


def _filter_features(feature_names: np.ndarray, group_dict: dict):
    """
    Restrict features and groupings to those shared by both inputs.

    Features absent from every grouping are dropped from the returned 
    feature array, and features absent from `feature_names` are 
    dropped from each grouping. Groupings left with fewer than two 
    features are removed. The input `group_dict` is not modified.

    Parameters
    ----------
    feature_names : np.ndarray
        Numpy array of corresponding feature names.

    group_dict : dict
        Dictionary containing feature grouping information.
            Example: {geneset: np.array(gene_1, gene_2, ..., 
                        gene_n)}

    Returns
    -------
    feature_names : np.ndarray
        Features present in at least one input grouping, kept in 
        the same relative order as in the input `feature_names` 
        (duplicates collapsed to their first occurrence).

    group_dict : dict
        New dictionary whose values are sorted arrays of the features 
        overlapping `feature_names`; only groupings with at least two 
        overlapping features are kept.
    """
    feature_set = set(feature_names)
    group_features = set()
    filtered_groups = {}

    for group, members in group_dict.items():
        member_set = set(members)

        # Every grouped feature counts toward the retained feature set, 
        # even when its grouping is dropped below
        group_features.update(member_set)

        overlap = np.sort(np.array(list(feature_set & member_set)))

        # Only keeping groupings that have at least two features
        if len(overlap) > 1:
            filtered_groups[group] = overlap

    # Walk feature_names (rather than the set) so the output order is 
    # deterministic and follows the data matrix column order
    kept = [feature for feature in dict.fromkeys(feature_names)
            if feature in group_features]

    return np.array(kept), filtered_groups


# NOTE: the default `seed_obj` is created once at definition time, so 
# calls relying on the default share (and advance) a single generator. 
# Pass an explicit generator for reproducible results.
def _multi_class_split(y: np.ndarray, train_ratio: float=0.8, class_threshold: str | int='median', 
                       seed_obj: np.random.Generator=np.random.default_rng(100)):
    """
    Function for calculating the training and testing cell positions 
    for multiclass data sets.

    Parameters
    ----------
    y : array_like
        Should be an iterable object corresponding to samples in 
        `ad.AnnData` object.

    train_ratio : float
        Ratio of number of training samples to entire data set. 
        Note: if a threshold is applied, the ratio training samples 
        may decrease depending on class balance and `class_threshold`
        parameter.

    class_threshold : str | int
        If is type `int`, classes with more training samples than 
        `class_threshold` will be down sampled to `class_threshold` 
        samples. If `'median'`, classes will be down sampled to the 
        median number of training samples per class.

    seed_obj : np.random.Generator
        Generator used to randomly sample and split data.

    Returns
    -------
    train_indices : np.ndarray
        Indices for training samples.

    test_indices : np.ndarray
        Indices for testing samples.
    """
    uniq_labels = np.unique(y)

    # Finding indices for each cell class
    class_positions = {class_ : np.where(y == class_)[0]
                       for class_ in uniq_labels}

    # Capturing training indices while maintaining original class proportions
    train_samples = {class_ : seed_obj.choice(positions,
                                              int(len(positions) * train_ratio),
                                              replace = False)
                     for class_, positions in class_positions.items()}

    # Testing indices are whatever was not drawn for training
    test_samples = {class_ : np.setdiff1d(class_positions[class_],
                                          train_samples[class_])
                    for class_ in class_positions}

    # Applying threshold for samples per class
    if class_threshold == 'median':
        cells_per_class = [len(values) for values in train_samples.values()]
        class_threshold = int(np.median(cells_per_class))

    # Down sample to class_threshold; sampling without replacement so a 
    # cell cannot appear in the training set more than once
    for class_ in train_samples:
        if len(train_samples[class_]) > class_threshold:
            train_samples[class_] = seed_obj.choice(train_samples[class_],
                                                    class_threshold,
                                                    replace = False)

    train_indices = np.array([idx for class_ in train_samples
                              for idx in train_samples[class_]])

    test_indices = np.array([idx for class_ in test_samples
                             for idx in test_samples[class_]])

    return train_indices, test_indices


# NOTE: the default `seed_obj` is created once at definition time, so 
# calls relying on the default share (and advance) a single generator.
def _binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 
                  train_ratio: float=0.8,
                  seed_obj: np.random.Generator=np.random.default_rng(100)):
    """
    Function to calculate training and testing indices for given 
    dataset. If train indices are given, it will calculate the test 
    indices. If train_indices == None, then it calculates both indices, 
    preserving the ratio of each label in y.

    Parameters
    ----------
    y : np.ndarray
        Numpy array of cell labels. Can have any number of classes 
        for this function.

    train_indices : np.ndarray | None
        Optional array of pre-determined training indices.

    train_ratio : float
        Decimal value ratio of samples in training/testing sets.

    seed_obj : np.random.Generator
        Numpy random generator used for random processes. Can be 
        specified for reproducibility or set by default.

    Returns
    -------
    train_indices : np.ndarray
        Array of indices of training cells.

    test_indices : np.ndarray
        Array of indices of testing cells.
    """
    # If train indices aren't provided
    if train_indices is None:

        train_indices = []

        for label in np.unique(y):

            # Find indices of each unique label
            label_indices = np.where(y == label)[0]

            # Sample these indices according to train ratio
            n = int(len(label_indices) * train_ratio)
            train_indices.extend(seed_obj.choice(label_indices, n,
                                                 replace = False))
    else:
        assert len(train_indices) <= len(y), ("More train indices than there "
                                              "are samples")

    train_indices = np.array(train_indices)

    # Test indices are the indices not in the train_indices. Uniqueness 
    # is not assumed because user-provided indices may repeat.
    test_indices = np.setdiff1d(np.arange(len(y)), train_indices)

    return train_indices, test_indices


def calculate_d(num_samples : int):
    """
    This function calculates the optimal number of dimensions for 
    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
    information.

    Parameters
    ----------
    num_samples : int
        The number of samples in the data set including both training
        and testing sets.

    Returns
    -------
    d : int
        The optimal number of dimensions to run scMKL with the given 
        data set. Never smaller than 100.

    Raises
    ------
    ValueError
        If `num_samples` is negative.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> 
    >>> num_cells = raw_counts.shape[0]
    >>> d = scmkl.calculate_d(num_cells)
    >>> d
    161
    """
    min_d = 100

    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")

    # log(log(n)) is undefined/infinite for n <= 1; the floor applies
    if num_samples <= 1:
        return min_d

    d = int(np.sqrt(num_samples)*np.log(np.log(num_samples)))

    return max(d, min_d)


def sort_samples(train_indices, test_indices):
    """
    Ensures that samples in adata obj are all training, then all 
    testing.

    Parameters
    ----------
    train_indices : np.ndarray
        Indices in ad.AnnData object for training.
    
    test_indices : np.ndarray
        Indices in ad.AnnData object for testing.

    Returns
    -------
    sort_idc : np.ndarray
        Ordered indices that will sort ad.AnnData object as all 
        training samples, then all testing.

    train_indices : np.ndarray
        The new training indices given the new index order, `sort_idc`.

    test_indices : np.ndarray
        The new testing indices given the new index order, `sort_idc`.
    """
    n_train = train_indices.shape[0]
    n_test = test_indices.shape[0]

    sort_idc = np.concatenate([train_indices, test_indices])

    # After reordering, training rows occupy the first n_train positions
    train_indices = np.arange(0, n_train)
    test_indices = np.arange(n_train, n_train + n_test)

    return sort_idc, train_indices, test_indices


def create_adata(X: scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame, 
                 feature_names: np.ndarray, cell_labels: np.ndarray, 
                 group_dict: dict, obs_names: None | np.ndarray=None, 
                 scale_data: bool=True, split_data: np.ndarray | None=None, 
                 D: int | None=None, remove_features: bool=True, 
                 train_ratio: float=0.8, distance_metric: str='euclidean', 
                 kernel_type: str='Gaussian', random_state: int=1, 
                 allow_multiclass: bool = False, 
                 class_threshold: str | int = 'median',
                 reduction: str | None = None, tfidf: bool = False):
    """
    Function to create an AnnData object to carry all relevant 
    information going forward.

    Parameters
    ----------
    X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
        A data matrix of cells by features (sparse array 
        recommended for large datasets).

    feature_names : np.ndarray
        Array of feature names corresponding with the features 
        in `X`.

    cell_labels : np.ndarray
        A numpy array of cell phenotypes corresponding with 
        the cells in `X`. When `split_data` is given, may instead 
        hold labels for the training cells only.

    group_dict : dict 
        Dictionary containing feature grouping information (i.e. 
        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
        np.array([...]), ...}`).

    obs_names : None | np.ndarray
        The cell names corresponding to `X` to be assigned to output 
        object `.obs_names` attribute.

    scale_data : bool  
        If `True`, data matrix is log transformed and standard 
        scaled. 
        
    split_data : None | np.ndarray
        If `None`, data will be split stratified by cell labels. 
        Else, is an array of precalculated train/test split 
        corresponding to samples. Can include labels for entire
        dataset to benchmark performance or for only training
        data to classify unknown cell types (i.e. `np.array(['train', 
        'test', ..., 'train'])`).

    D : int 
        Number of Random Fourier Features used to calculate Z. 
        Should be a positive integer. Higher values of D will 
        increase classification accuracy at the cost of computation 
        time. If set to `None`, will be calculated given number of 
        samples. 
    
    remove_features : bool 
        If `True`, will remove features from `X` and `feature_names` 
        not in `group_dict` and remove features from groupings not in 
        `feature_names`.

    train_ratio : float
        Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio training samples may 
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    distance_metric : str
        The pairwise distance metric used to estimate sigma. Must
        be one of the options used in `scipy.spatial.distance.cdist`.

    kernel_type : str
        The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    random_state : int
        Integer random_state used to set the seed for 
        reproducibility.

    allow_multiclass : bool
        If `False`, will ensure that cell labels are binary.

    class_threshold : str | int
        Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number 
        of cells per cell class will be the threshold for number of 
        samples per class.

    reduction: str | None
        Choose which dimension reduction technique to perform on 
        features within a group. 'svd' will run 
        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
        by an array of 1s down to 50 dimensions.
        
    tfidf: bool
        Whether to calculate TFIDF transformation on peaks within 
        groupings.
        
    Returns
    -------
    adata : ad.AnnData
        AnnData with the following attributes and keys:

        `adata.X` (array_like):
            Data matrix.
    
        `adata.var_names` (array_like): 
            Feature names corresponding to `adata.X`.

        `adata.obs['labels']` (array_like):
            cell classes/phenotypes from `cell_labels`.

        `adata.uns['train_indices']` (array_like):
            Indices for training data. 

        `adata.uns['test_indices']` (array_like)
            Indices for testing data.

        `adata.uns['group_dict']` (dict):
            Grouping information.

        `adata.uns['seed_obj']` (np.random.Generator):
            Seed object with seed equal to 100 * `random_state`.

        `adata.uns['D']` (int):
            Number of dimensions to scMKL with.

        `adata.uns['scale_data']` (bool):
            Whether or not data is log and z-score transformed.

        `adata.uns['distance_metric']` (str): 
            Distance metric as given.
    
        `adata.uns['kernel_type']` (str): 
            Kernel function as given.

        `adata.uns['reduction']` (str): 
            Reduction technique as given, or the string `'None'`.

        `adata.uns['tfidf']` (bool):
            Whether to calculate TF-IDF per grouping.

        `adata.uns['labeled_test']` (bool):
            `False` when labels were given for training cells only.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> cell_labels = np.load('MCF7_cell_labels.pkl', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl', 
    ...                      allow_pickle = True)
    >>> 
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            cell_labels = cell_labels,
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
    'distance_metric', 'train_indices', 'test_indices'
    """

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")
    
    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:    
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    if isinstance(obs_names, np.ndarray):
        adata.obs_names = obs_names

    filtered_feature_names, group_dict = _filter_features(feature_names, 
                                                          group_dict)

    if remove_features:
        # The subset below is a view; later writes to it would otherwise 
        # emit ImplicitModificationWarning
        warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning)
        adata = adata[:, filtered_feature_names]
    
    gc.collect()

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100*random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric
    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
    adata.uns['tfidf'] = tfidf

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if allow_multiclass:
            split = _multi_class_split(cell_labels, 
                                       seed_obj = adata.uns['seed_obj'], 
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
        else:
            split = _binary_split(cell_labels, 
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)

        train_indices, test_indices = split
        adata.uns['labeled_test'] = True

    else:
        # Accept list-like input; elementwise comparison needs an array
        split_data = np.asarray(split_data)

        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        # Labels may cover every cell or only the training cells
        x_eq_labs = X.shape[0] == len(cell_labels)
        train_eq_labs = len(train_indices) == len(cell_labels)
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")

        if train_eq_labs:
            # Test cells are unlabeled; fill them with a placeholder
            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        else:
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    # Ensuring all train samples are first in adata object followed by test
    sort_idx, train_indices, test_indices = sort_samples(train_indices, 
                                                         test_indices)

    adata = adata[sort_idx]

    # Without user-provided names, renumber observations from zero
    if not isinstance(obs_names, np.ndarray):
        adata.obs = adata.obs.reset_index(drop=True)
        adata.obs.index = adata.obs.index.astype('O')

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled. "
              "To change this behavior, set scale_data to True")

    return adata


def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 
                 group_dict: dict | str, use_raw: bool=False, 
                 scale_data: bool=True, split_data: np.ndarray | None=None, 
                 D: int | None=None, remove_features: bool=True, 
                 train_ratio: float=0.8, distance_metric: str='euclidean', 
                 kernel_type: str='Gaussian', random_state: int=1, 
                 allow_multiclass: bool = False, 
                 class_threshold: str | int = 'median',
                 reduction: str | None = None, tfidf: bool = False):
    """
    Function to format an `ad.AnnData` object to carry all relevant 
    information going forward. `adata.obs_names` will be retained.

    **NOTE: Information not needed for running `scmkl` will be 
    removed.**

    Parameters
    ----------
    adata : ad.AnnData | str
        Object with data for `scmkl` to be applied to. Only requirement 
        is that `.var_names` is correct and data matrix is in `adata.X` 
        or `adata.raw.X`. A h5ad file can be provided as a `str` and it 
        will be read in.

    cell_labels : np.ndarray | str
        If type `str`, the labels for `scmkl` to learn are captured 
        from `adata.obs[cell_labels]`. Else, a `np.ndarray` of cell 
        phenotypes corresponding with the cells in `adata.X`.

    group_dict : dict | str
        Dictionary containing feature grouping information (i.e. 
        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
        np.array([...]), ...}`). A pickle file can be provided as a 
        `str` and it will be read in. Only load pickle files from 
        trusted sources.

    use_raw : bool
        If `False`, will use `adata.X` to create new `adata`. Else, 
        will use `adata.raw.X` together with `adata.raw.var_names`.

    scale_data : bool  
        If `True`, data matrix is log transformed and standard 
        scaled. 
        
    split_data : None | np.ndarray
        If `None`, data will be split stratified by cell labels. 
        Else, is an array of precalculated train/test split 
        corresponding to samples. Can include labels for entire
        dataset to benchmark performance or for only training
        data to classify unknown cell types (i.e. `np.array(['train', 
        'test', ..., 'train'])`).

    D : int 
        Number of Random Fourier Features used to calculate Z. 
        Should be a positive integer. Higher values of D will 
        increase classification accuracy at the cost of computation 
        time. If set to `None`, will be calculated given number of 
        samples. 
    
    remove_features : bool 
        If `True`, will remove features from `X` and `feature_names` 
        not in `group_dict` and remove features from groupings not in 
        `feature_names`.

    train_ratio : float
        Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio training samples may 
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    distance_metric : str
        The pairwise distance metric used to estimate sigma. Must
        be one of the options used in `scipy.spatial.distance.cdist`.

    kernel_type : str
        The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    random_state : int
        Integer random_state used to set the seed for 
        reproducibility.

    allow_multiclass : bool
        If `False`, will ensure that cell labels are binary.

    class_threshold : str | int
        Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number 
        of cells per cell class will be the threshold for number of 
        samples per class.

    reduction: str | None
        Choose which dimension reduction technique to perform on 
        features within a group. 'svd' will run 
        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
        by an array of 1s down to 50 dimensions.
        
    tfidf: bool
        Whether to calculate TFIDF transformation on peaks within 
        groupings.
        
    Returns
    -------
    adata : ad.AnnData
        AnnData as returned by `create_adata`, i.e. with 
        `adata.obs['labels']` and the `adata.uns` keys 
        `'train_indices'`, `'test_indices'`, `'group_dict'`, 
        `'seed_obj'`, `'D'`, `'scale_data'`, `'distance_metric'`, 
        `'kernel_type'`, `'reduction'`, `'tfidf'`, and 
        `'labeled_test'`.

    Examples
    --------
    >>> adata = ad.read_h5ad('MCF7_rna.h5ad')
    >>> group_dict = np.load('hallmark_genesets.pkl', 
    ...                      allow_pickle = True)
    >>> 
    >>> # The labels in adata.obs we want to learn are 'celltypes'
    >>> adata = scmkl.format_adata(adata, 'celltypes', 
    ...                            group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
    'distance_metric', 'train_indices', 'test_indices'
    """
    if isinstance(adata, str):
        adata = ad.read_h5ad(adata)

    if isinstance(group_dict, str):
        # NOTE: unpickling can execute arbitrary code; the file must 
        # come from a trusted source
        group_dict = np.load(group_dict, allow_pickle=True)
        
    if isinstance(cell_labels, str):
        err_msg = f"{cell_labels} is not in `adata.obs`"
        assert cell_labels in adata.obs.keys(), err_msg
        cell_labels = adata.obs[cell_labels].to_numpy()
    
    if use_raw:
        assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`"
        X = adata.raw.X
        # The raw matrix can hold a different feature set than adata.X, 
        # so its own feature names must accompany it
        feature_names = adata.raw.var_names.to_numpy().copy()
    else:
        X = adata.X
        feature_names = adata.var_names.to_numpy().copy()

    adata = create_adata(X, feature_names, cell_labels, 
                         group_dict, adata.obs_names.to_numpy().copy(), 
                         scale_data, split_data, D, remove_features, 
                         train_ratio, distance_metric, kernel_type, 
                         random_state, allow_multiclass, class_threshold, 
                         reduction, tfidf)

    return adata
def calculate_d(num_samples : int):
    """
    This function calculates the optimal number of dimensions for 
    performance. See https://doi.org/10.48550/arXiv.1806.09178 for more
    information.

    Parameters
    ----------
    num_samples : int
        The number of samples in the data set including both training
        and testing sets.

    Returns
    -------
    d : int
        The optimal number of dimensions to run scMKL with the given 
        data set. Never smaller than 100.

    Raises
    ------
    ValueError
        If `num_samples` is negative.

    Examples
    --------
    >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
    >>> 
    >>> num_cells = raw_counts.shape[0]
    >>> d = scmkl.calculate_d(num_cells)
    >>> d
    161
    """
    min_d = 100

    if num_samples < 0:
        raise ValueError("num_samples must be non-negative")

    # log(log(n)) is undefined/infinite for n <= 1, which previously made 
    # the int() conversion fail; the floor applies to such tiny inputs
    if num_samples <= 1:
        return min_d

    d = int(np.sqrt(num_samples)*np.log(np.log(num_samples)))

    return max(d, min_d)
This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.
Parameters
- num_samples (int): The number of samples in the data set including both training and testing sets.
Returns
- d (int): The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>>
>>> num_cells = raw_counts.shape[0]
>>> d = scmkl.calculate_d(num_cells)
>>> d
161
def sort_samples(train_indices, test_indices):
    """
    Build an ordering that places every training sample before every 
    testing sample.

    Parameters
    ----------
    train_indices : np.ndarray
        Indices in ad.AnnData object for training.
    
    test_indices : np.ndarray
        Indices in ad.AnnData object for testing.

    Returns
    -------
    sort_idc : np.ndarray
        Ordered indices that will sort ad.AnnData object as all 
        training samples, then all testing.

    train_indices : np.ndarray
        The new training indices given the new index order, `sort_idc`.

    test_indices : np.ndarray
        The new testing indices given the new index order, `sort_idc`.
    """
    n_train = train_indices.shape[0]
    n_total = n_train + test_indices.shape[0]

    # Original positions: training block first, testing block second
    sort_idc = np.concatenate((train_indices, test_indices))

    # Positions within the reordered object
    new_train = np.arange(n_train)
    new_test = np.arange(n_train, n_total)

    return sort_idc, new_train, new_test
Ensures that samples in adata obj are all training, then all testing.
Parameters
- train_indices (np.ndarray): Indices in ad.AnnData object for training.
- test_indices (np.ndarray): Indices in ad.AnnData object for testing.
Returns
- sort_idc (np.ndarray): Ordered indices that will sort ad.AnnData object as all training samples, then all testing.
- train_indices (np.ndarray): The new training indices given the new index order, `sort_idc`.
- test_indices (np.ndarray): The new testing indices given the new index order, `sort_idc`.
def create_adata(X: scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame, 
                 feature_names: np.ndarray, cell_labels: np.ndarray, 
                 group_dict: dict, obs_names: None | np.ndarray=None, 
                 scale_data: bool=True, split_data: np.ndarray | None=None, 
                 D: int | None=None, remove_features: bool=True, 
                 train_ratio: float=0.8, distance_metric: str='euclidean', 
                 kernel_type: str='Gaussian', random_state: int=1, 
                 allow_multiclass: bool = False, 
                 class_threshold: str | int = 'median',
                 reduction: str | None = None, tfidf: bool = False):
    """
    Function to create an AnnData object to carry all relevant 
    information going forward.

    Parameters
    ----------
    X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame
        A data matrix of cells by features (sparse array 
        recommended for large datasets).

    feature_names : np.ndarray
        Array of feature names corresponding with the features 
        in `X`.

    cell_labels : np.ndarray
        A numpy array of cell phenotypes corresponding with 
        the cells in `X`. When `split_data` is given, may instead 
        hold labels for the training cells only.

    group_dict : dict 
        Dictionary containing feature grouping information (i.e. 
        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
        np.array([...]), ...}`).

    obs_names : None | np.ndarray
        The cell names corresponding to `X` to be assigned to output 
        object `.obs_names` attribute.

    scale_data : bool  
        If `True`, data matrix is log transformed and standard 
        scaled. 
        
    split_data : None | np.ndarray
        If `None`, data will be split stratified by cell labels. 
        Else, is an array of precalculated train/test split 
        corresponding to samples. Can include labels for entire
        dataset to benchmark performance or for only training
        data to classify unknown cell types (i.e. `np.array(['train', 
        'test', ..., 'train'])`).

    D : int 
        Number of Random Fourier Features used to calculate Z. 
        Should be a positive integer. Higher values of D will 
        increase classification accuracy at the cost of computation 
        time. If set to `None`, will be calculated given number of 
        samples. 
    
    remove_features : bool 
        If `True`, will remove features from `X` and `feature_names` 
        not in `group_dict` and remove features from groupings not in 
        `feature_names`.

    train_ratio : float
        Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio training samples may 
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    distance_metric : str
        The pairwise distance metric used to estimate sigma. Must
        be one of the options used in `scipy.spatial.distance.cdist`.

    kernel_type : str
        The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    random_state : int
        Integer random_state used to set the seed for 
        reproducibility.

    allow_multiclass : bool
        If `False`, will ensure that cell labels are binary.

    class_threshold : str | int
        Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number 
        of cells per cell class will be the threshold for number of 
        samples per class.

    reduction: str | None
        Choose which dimension reduction technique to perform on 
        features within a group. 'svd' will run 
        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
        by an array of 1s down to 50 dimensions.
        
    tfidf: bool
        Whether to calculate TFIDF transformation on peaks within 
        groupings.
        
    Returns
    -------
    adata : ad.AnnData
        AnnData with the following attributes and keys:

        `adata.X` (array_like):
            Data matrix.
    
        `adata.var_names` (array_like): 
            Feature names corresponding to `adata.X`.

        `adata.obs['labels']` (array_like):
            cell classes/phenotypes from `cell_labels`.

        `adata.uns['train_indices']` (array_like):
            Indices for training data. 

        `adata.uns['test_indices']` (array_like)
            Indices for testing data.

        `adata.uns['group_dict']` (dict):
            Grouping information.

        `adata.uns['seed_obj']` (np.random.Generator):
            Seed object with seed equal to 100 * `random_state`.

        `adata.uns['D']` (int):
            Number of dimensions to scMKL with.

        `adata.uns['scale_data']` (bool):
            Whether or not data is log and z-score transformed.

        `adata.uns['distance_metric']` (str): 
            Distance metric as given.
    
        `adata.uns['kernel_type']` (str): 
            Kernel function as given.

        `adata.uns['reduction']` (str): 
            Reduction technique as given, or the string `'None'`.

        `adata.uns['tfidf']` (bool):
            Whether to calculate TF-IDF per grouping.

        `adata.uns['labeled_test']` (bool):
            `False` when labels were given for training cells only.

    Examples
    --------
    >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
    >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
    >>> cell_labels = np.load('MCF7_cell_labels.pkl', allow_pickle = True)
    >>> group_dict = np.load('hallmark_genesets.pkl', 
    ...                      allow_pickle = True)
    >>> 
    >>> adata = scmkl.create_adata(X = data_mat, 
    ...                            feature_names = gene_names, 
    ...                            cell_labels = cell_labels,
    ...                            group_dict = group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
    'distance_metric', 'train_indices', 'test_indices'
    """

    assert X.shape[1] == len(feature_names), ("Different number of features "
                                              "in X than feature names")
    
    if not allow_multiclass:
        assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain "
                                                  "2 classes")
    if D is not None:    
        assert isinstance(D, int) and D > 0, 'D must be a positive integer'

    kernel_options = ['gaussian', 'laplacian', 'cauchy']
    assert kernel_type.lower() in kernel_options, ("Given kernel type not "
                                                   "implemented. Gaussian, "
                                                   "Laplacian, and Cauchy "
                                                   "are the acceptable "
                                                   "types.")

    # Create adata object and add column names
    adata = ad.AnnData(X)
    adata.var_names = feature_names

    if isinstance(obs_names, np.ndarray):
        adata.obs_names = obs_names

    filtered_feature_names, group_dict = _filter_features(feature_names, 
                                                          group_dict)

    if remove_features:
        # The subset below is a view; later writes to it would otherwise 
        # emit ImplicitModificationWarning
        warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning)
        adata = adata[:, filtered_feature_names]
    
    gc.collect()

    # Add metadata to adata object
    adata.uns['group_dict'] = group_dict
    adata.uns['seed_obj'] = np.random.default_rng(100*random_state)
    adata.uns['scale_data'] = scale_data
    adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0])
    adata.uns['kernel_type'] = kernel_type
    adata.uns['distance_metric'] = distance_metric
    adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None'
    adata.uns['tfidf'] = tfidf

    if split_data is None:
        assert X.shape[0] == len(cell_labels), ("Different number of cells "
                                                "than labels")
        adata.obs['labels'] = cell_labels

        if allow_multiclass:
            split = _multi_class_split(cell_labels, 
                                       seed_obj = adata.uns['seed_obj'], 
                                       class_threshold = class_threshold,
                                       train_ratio = train_ratio)
        else:
            split = _binary_split(cell_labels, 
                                  seed_obj = adata.uns['seed_obj'],
                                  train_ratio = train_ratio)

        train_indices, test_indices = split
        adata.uns['labeled_test'] = True

    else:
        # Accept list-like input; elementwise comparison needs an array
        split_data = np.asarray(split_data)

        train_indices = np.where(split_data == 'train')[0]
        test_indices = np.where(split_data == 'test')[0]

        # Labels may cover every cell or only the training cells
        x_eq_labs = X.shape[0] == len(cell_labels)
        train_eq_labs = len(train_indices) == len(cell_labels)
        assert x_eq_labs or train_eq_labs, ("Must give labels for all cells "
                                            "or only for training cells")

        if train_eq_labs:
            # Test cells are unlabeled; fill them with a placeholder
            padded_cell_labels = np.zeros((X.shape[0])).astype('object')
            padded_cell_labels[train_indices] = cell_labels
            padded_cell_labels[test_indices] = 'padded_test_label'

            adata.obs['labels'] = padded_cell_labels
            adata.uns['labeled_test'] = False

        else:
            adata.obs['labels'] = cell_labels
            adata.uns['labeled_test'] = True

    # Ensuring all train samples are first in adata object followed by test
    sort_idx, train_indices, test_indices = sort_samples(train_indices, 
                                                         test_indices)

    adata = adata[sort_idx]

    # Without user-provided names, renumber observations from zero
    if not isinstance(obs_names, np.ndarray):
        adata.obs = adata.obs.reset_index(drop=True)
        adata.obs.index = adata.obs.index.astype('O')

    adata.uns['train_indices'] = train_indices
    adata.uns['test_indices'] = test_indices

    if not scale_data:
        print("WARNING: Data will not be log transformed and scaled. "
              "To change this behavior, set scale_data to True")

    return adata
Function to create an AnnData object to carry all relevant information going forward.
Parameters
- X (scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame): A data matrix of cells by features (sparse array recommended for large datasets).
- feature_names (np.ndarray):
Array of feature names corresponding with the features
in
X. - cell_labels (np.ndarray):
A numpy array of cell phenotypes corresponding with
the cells in
X. - group_dict (dict):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}). - obs_names (None | np.ndarray):
The cell names corresponding to
X to be assigned to output object .obs_names attribute. - scale_data (bool):
If
True, data matrix is log transformed and standard scaled. - split_data (None | np.ndarray):
If
None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train'])). - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None, will be calculated given number of samples. - remove_features (bool):
If
True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio of training samples may
decrease depending on class balance and
class_threshold parameter if allow_multiclass = True. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian', 'Laplacian', or 'Cauchy'. - random_state (int): Integer random_state used to set the seed for reproducibility.
- allow_multiclass (bool):
If
False, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median', the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X (array_like): Data matrix. adata.var_names (array_like): Feature names corresponding to adata.X. adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels. adata.uns['train_indices'] (array_like): Indices for training data. adata.uns['test_indices'] (array_like): Indices for testing data. adata.uns['group_dict'] (dict): Grouping information. adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state. adata.uns['D'] (int): Number of dimensions to scMKL with. adata.uns['scale_data'] (bool): Whether or not data is log and z-score transformed. adata.uns['distance_metric'] (str): Distance metric as given. adata.uns['kernel_type'] (str): Kernel function as given. adata.uns['svd'] (bool): Whether to calculate SVD reduction. adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.
Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'
def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 
                 group_dict: dict | str, use_raw: bool=False, 
                 scale_data: bool=True, split_data: np.ndarray | None=None, 
                 D: int | None=None, remove_features: bool=True, 
                 train_ratio: float=0.8, distance_metric: str='euclidean', 
                 kernel_type: str='Gaussian', random_state: int=1, 
                 allow_multiclass: bool = False, 
                 class_threshold: str | int = 'median',
                 reduction: str | None = None, tfidf: bool = False):
    """
    Function to format an `ad.AnnData` object to carry all relevant 
    information going forward. `adata.obs_names` are passed on to the 
    new object.

    **NOTE: Information not needed for running `scmkl` will be 
    removed.** Only the data matrix, feature names, cell names, and 
    labels are handed to `create_adata`; the returned object is the 
    one built there.

    Parameters
    ----------
    adata : ad.AnnData | str
        Object with data for `scmkl` to be applied to. Only 
        requirement is that `.var_names` is correct and data matrix is 
        in `adata.X` or `adata.raw.X`. A path to a h5ad file can be 
        provided as a `str` and it will be read in with 
        `ad.read_h5ad`.

    cell_labels : np.ndarray | str
        If type `str`, it is treated as a column name and the labels 
        are captured from `adata.obs[cell_labels]`. Else, a 
        `np.ndarray` of cell phenotypes corresponding with the cells 
        in the data matrix.

    group_dict : dict | str
        Dictionary containing feature grouping information (i.e. 
        `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 
        np.array([...]), ...}`). A path to a pickle file can be 
        provided as a `str` and it will be read in. Only pass paths to 
        trusted files; unpickling can execute arbitrary code.

    use_raw : bool
        If `False`, will use `adata.X` and `adata.var_names` to create 
        new `adata`. Else, will use `adata.raw.X` and 
        `adata.raw.var_names`.

    scale_data : bool  
        If `True`, data matrix is log transformed and standard 
        scaled. 
        
    split_data : None | np.ndarray
        If `None`, data will be split stratified by cell labels. 
        Else, is an array of precalculated train/test split 
        corresponding to samples. Can include labels for entire 
        dataset to benchmark performance or for only training 
        data to classify unknown cell types (i.e. `np.array(['train', 
        'test', ..., 'train'])`).

    D : int | None
        Number of Random Fourier Features used to calculate Z. 
        Should be a positive integer. Higher values of D will 
        increase classification accuracy at the cost of computation 
        time. If set to `None`, will be calculated given number of 
        samples. 
    
    remove_features : bool
        If `True`, will remove features from the data matrix and 
        feature names not in `group_dict` and remove features from 
        groupings not in the feature names.

    train_ratio : float
        Ratio of number of training samples to entire data set. Note:
        if a threshold is applied, the ratio of training samples may 
        decrease depending on class balance and `class_threshold`
        parameter if `allow_multiclass = True`.

    distance_metric : str
        The pairwise distance metric used to estimate sigma. Must
        be one of the options used in `scipy.spatial.distance.cdist`.

    kernel_type : str
        The approximated kernel function used to calculate Zs.
        Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`.

    random_state : int
        Integer random_state used to set the seed for 
        reproducibility.

    allow_multiclass : bool
        If `False`, will ensure that cell labels are binary.

    class_threshold : str | int
        Number of samples allowed in the training data for each cell
        class in the training data. If `'median'`, the median number 
        of cells per cell class will be the threshold for number of 
        samples per class.

    reduction : str | None
        Choose which dimension reduction technique to perform on 
        features within a group. 'svd' will run 
        `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 
        by an array of 1s down to 50 dimensions.
        
    tfidf : bool
        Whether to calculate TFIDF transformation on peaks within 
        groupings.
        
    Returns
    -------
    adata : ad.AnnData
        The object returned by `create_adata`, with the following 
        attributes and keys:

        `adata.X` (array_like):
            Data matrix.
    
        `adata.var_names` (array_like): 
            Feature names corresponding to `adata.X`.

        `adata.obs['labels']` (array_like):
            cell classes/phenotypes from `cell_labels`.

        `adata.uns['train_indices']` (array_like):
            Indices for training data. 

        `adata.uns['test_indices']` (array_like):
            Indices for testing data.

        `adata.uns['labeled_test']` (bool):
            Whether labels were provided for the testing data.

        `adata.uns['group_dict']` (dict):
            Grouping information.

        `adata.uns['seed_obj']` (np.random._generator.Generator): 
            Seed object with seed equal to 100 * `random_state`.

        `adata.uns['D']` (int):
            Number of dimensions to scMKL with.

        `adata.uns['scale_data']` (bool):
            Whether or not data is log and z-score transformed.

        `adata.uns['distance_metric']` (str): 
            Distance metric as given.
    
        `adata.uns['kernel_type']` (str): 
            Kernel function as given.

        `adata.uns['reduction']` (str): 
            Dimension reduction technique as given (the string 
            `'None'` when no reduction was requested).

        `adata.uns['tfidf']` (bool): 
            Whether to calculate TF-IDF per grouping.

    Raises
    ------
    AssertionError
        If `cell_labels` is a `str` that is not a column of 
        `adata.obs`, or if `use_raw` is `True` while `adata.raw` is 
        not set.

    Examples
    --------
    >>> adata = ad.read_h5ad('MCF7_rna.h5ad')
    >>> group_dict = np.load('hallmark_genesets.pkl', 
    ...                      allow_pickle = True)
    >>> 
    >>> # The labels in adata.obs we want to learn are 'celltypes'
    >>> adata = scmkl.format_adata(adata, 'celltypes', 
    ...                            group_dict)
    >>> adata
    AnnData object with n_obs × n_vars = 1000 × 4341
    obs: 'labels'
    uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 
    'distance_metric', 'train_indices', 'test_indices'
    """
    # isinstance (rather than exact type equality) so that str
    # subclasses such as np.str_ are also treated as paths/column names
    if isinstance(adata, str):
        adata = ad.read_h5ad(adata)

    if isinstance(group_dict, str):
        # NOTE: allow_pickle=True unpickles the file, which can execute
        # arbitrary code; only load grouping files from trusted sources
        group_dict = np.load(group_dict, allow_pickle=True)
        
    if isinstance(cell_labels, str):
        err_msg = f"{cell_labels} is not in `adata.obs`"
        assert cell_labels in adata.obs.keys(), err_msg
        cell_labels = adata.obs[cell_labels].to_numpy()
    
    if use_raw:
        assert adata.raw is not None, ("`adata.raw` is empty, set "
                                       "`use_raw` to `False`")
        X = adata.raw.X
        # `.raw` keeps its own feature axis, which may differ from
        # `adata.var_names` (e.g. after feature filtering), so the
        # names must come from the same place as the matrix
        feature_names = adata.raw.var_names.to_numpy().copy()
    else:
        X = adata.X
        feature_names = adata.var_names.to_numpy().copy()

    adata = create_adata(X, feature_names, cell_labels, 
                         group_dict, adata.obs_names.to_numpy().copy(), 
                         scale_data, split_data, D, remove_features, 
                         train_ratio, distance_metric, kernel_type, 
                         random_state, allow_multiclass, class_threshold, 
                         reduction, tfidf)

    return adata
Function to format an ad.AnnData object to carry all relevant
information going forward. adata.obs_names will be retained.
NOTE: Information not needed for running scmkl will be
removed.
Parameters
- adata (ad.AnnData):
Object with data for
scmkl to be applied to. Only requirement is that .var_names is correct and data matrix is in adata.X or adata.raw.X. A h5ad file can be provided as a str and it will be read in. - cell_labels (np.ndarray | str):
If type
str, the labels for scmkl to learn are captured from adata.obs[cell_labels]. Else, a np.ndarray of cell phenotypes corresponding with the cells in adata.X. - group_dict (dict | str):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}). A pickle file can be provided as a str and it will be read in. - obs_names (None | np.ndarray):
The cell names corresponding to
X to be assigned to output object .obs_names attribute. - use_raw (bool):
If
False, will use adata.X to create new adata. Else, will use adata.raw.X. - scale_data (bool):
If
True, data matrix is log transformed and standard scaled. - split_data (None | np.ndarray):
If
None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e. np.array(['train', 'test', ..., 'train'])). - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None, will be calculated given number of samples. - remove_features (bool):
If
True, will remove features from X and feature_names not in group_dict and remove features from groupings not in feature_names. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio of training samples may
decrease depending on class balance and
class_threshold parameter if allow_multiclass = True. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian', 'Laplacian', or 'Cauchy'. - random_state (int): Integer random_state used to set the seed for reproducibility.
- allow_multiclass (bool):
If
False, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median', the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X (array_like): Data matrix. adata.var_names (array_like): Feature names corresponding to adata.X. adata.obs['labels'] (array_like): cell classes/phenotypes from cell_labels. adata.uns['train_indices'] (array_like): Indices for training data. adata.uns['test_indices'] (array_like): Indices for testing data. adata.uns['group_dict'] (dict): Grouping information. adata.uns['seed_obj'] (np.random._generator.Generator): Seed object with seed equal to 100 * random_state. adata.uns['D'] (int): Number of dimensions to scMKL with. adata.uns['scale_data'] (bool): Whether or not data is log and z-score transformed. adata.uns['distance_metric'] (str): Distance metric as given. adata.uns['kernel_type'] (str): Kernel function as given. adata.uns['svd'] (bool): Whether to calculate SVD reduction. adata.uns['tfidf'] (bool): Whether to calculate TF-IDF per grouping.
Examples
>>> adata = ad.read_h5ad('MCF7_rna.h5ad')
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>>
>>> # The labels in adata.obs we want to learn are 'celltypes'
>>> adata = scmkl.format_adata(adata, 'celltypes',
... group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'