scmkl.create_adata
1import numpy as np 2import anndata as ad 3import scipy 4import pandas as pd 5import gc 6import warnings 7 8 9def _filter_features(feature_names: np.ndarray, group_dict: dict): 10 """ 11 Function to remove features only in feature names or group_dict. 12 Any features not included in group_dict will be removed from the 13 matrix. Also puts the features in the same relative order (of 14 included features) 15 16 Parameters 17 ---------- 18 feature_names : np.ndarray 19 Numpy array of corresponding feature names. 20 21 group_dict : dict 22 Dictionary containing feature grouping information. 23 Example: {geneset: np.array(gene_1, gene_2, ..., 24 gene_n)} 25 Returns 26 ------- 27 feature_names : np.ndarray 28 Numpy array of corresponding feature names from group_dict. 29 30 group_dict : dict 31 Dictionary containing features overlapping input grouping 32 information and full feature names. 33 """ 34 group_features = set() 35 feature_set = set(feature_names) 36 37 # Store all objects in dictionary in set 38 for group in group_dict.keys(): 39 group_features.update(set(group_dict[group])) 40 41 # Finds intersection between group features and features in data 42 # Converts to nd.array and sorts to preserve order of feature names 43 group_feats = list(feature_set.intersection(set(group_dict[group]))) 44 group_dict[group] = np.sort(np.array(group_feats)) 45 46 # Only keeping groupings that have at least two features 47 group_dict = {group : group_dict[group] for group in group_dict.keys() 48 if len(group_dict[group]) > 1} 49 50 group_features = np.array(list(group_features.intersection(feature_set))) 51 52 return group_features, group_dict 53 54 55def _multi_class_split(y: np.ndarray, train_ratio: float=0.8, class_threshold: str | int='median', 56 seed_obj: np.random._generator.Generator=np.random.default_rng(100)): 57 """ 58 Function for calculating the training and testing cell positions 59 for multiclass data sets. 60 61 Parameters 62 ---------- 63 y : array_like 64 Should be an iterable object cooresponding to samples in 65 `ad.AnnData` object. 66 67 seed_obj : np.random._generator.Generator 68 Seed used to randomly sample and split data. 69 70 train_ratio : float 71 Ratio of number of training samples to entire data set. 72 Note: if a threshold is applied, the ratio training samples 73 may decrease depending on class balance and `class_threshold` 74 parameter. 75 76 class_threshold : str | int 77 If is type `int`, classes with more samples than 78 class_threshold will be sampled. If `'median'`, 79 samples will be sampled to the median number of samples per 80 class. 81 82 Returns 83 ------- 84 train_indices : np.ndarray 85 Indices for training samples. 86 87 test_indices : np.ndarray 88 Indices for testing samples. 89 """ 90 uniq_labels = np.unique(y) 91 92 # Finding indices for each cell class 93 class_positions = {class_ : np.where(y == class_)[0] 94 for class_ in uniq_labels} 95 96 # Capturing training indices while maintaining original class proportions 97 train_samples = {class_ : seed_obj.choice(class_positions[class_], 98 int(len(class_positions[class_]) 99 * train_ratio), 100 replace = False) 101 for class_ in class_positions.keys()} 102 103 # Capturing testing indices while maintaining original class proportions 104 test_samples = {class_ : np.setdiff1d(class_positions[class_], 105 train_samples[class_]) 106 for class_ in class_positions.keys()} 107 108 # Applying threshold for samples per class 109 if class_threshold == 'median': 110 # I believe this does the same as the commented code below 111 112 cells_per_class = [len(values) for values in train_samples.values()] 113 class_threshold = int(np.median(cells_per_class)) 114 # all_train = [idx for class_ in train_samples.keys() 115 # for idx in train_samples[class_]] 116 # _, class_threshold = np.unique(y[all_train], return_counts = True) 117 # class_threshold = int(np.median(class_threshold)) 118 119 # Down sample to class_threshold 120 for class_ in train_samples.keys(): 121 if len(train_samples[class_]) > class_threshold: 122 train_samples[class_] = seed_obj.choice(train_samples[class_], 123 class_threshold) 124 125 train_indices = np.array([idx for class_ in train_samples.keys() 126 for idx in train_samples[class_]]) 127 128 test_indices = np.array([idx for class_ in test_samples.keys() 129 for idx in test_samples[class_]]) 130 131 return train_indices, test_indices 132 133 134def _binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 135 train_ratio: float=0.8, 136 seed_obj: np.random._generator.Generator=np.random.default_rng(100)): 137 """ 138 Function to calculate training and testing indices for given 139 dataset. If train indices are given, it will calculate the test 140 indices. If train_indices == None, then it calculates both indices, 141 preserving the ratio of each label in y 142 143 Parameters 144 ---------- 145 y : np.ndarray 146 Numpy array of cell labels. Can have any number of classes 147 for this function. 148 149 train_indices : np.ndarray | None 150 Optional array of pre-determined training indices 151 152 train_ratio : float 153 Decimal value ratio of features in training/testing sets 154 155 seed_obj : np.random._generator.Generator 156 Numpy random state used for random processes. Can be 157 specified for reproducubility or set by default. 158 159 160 Returns 161 ------- 162 train_indices : np.ndarray 163 Array of indices of training cells. 164 165 test_indices : np.ndarray: 166 Array of indices of testing cells. 167 """ 168 # If train indices aren't provided 169 if train_indices is None: 170 171 unique_labels = np.unique(y) 172 train_indices = [] 173 174 for label in unique_labels: 175 176 # Find indices of each unique label 177 label_indices = np.where(y == label)[0] 178 179 # Sample these indices according to train ratio 180 n = int(len(label_indices) * train_ratio) 181 train_label_indices = seed_obj.choice(label_indices, n, 182 replace = False) 183 train_indices.extend(train_label_indices) 184 else: 185 assert len(train_indices) <= len(y), ("More train indices than there " 186 "are samples") 187 188 train_indices = np.array(train_indices) 189 190 # Test indices are the indices not in the train_indices 191 test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 192 assume_unique = True) 193 194 return train_indices, test_indices 195 196 197def calculate_d(num_samples : int): 198 """ 199 This function calculates the optimal number of dimensions for 200 performance. See https://doi.org/10.48550/arXiv.1806.09178 for more 201 information. 202 203 Parameters 204 ---------- 205 num_samples : int 206 The number of samples in the data set including both training 207 and testing sets. 208 209 Returns 210 ------- 211 d : int 212 The optimal number of dimensions to run scMKL with the given 213 data set. 214 215 Examples 216 -------- 217 >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz') 218 >>> 219 >>> num_cells = raw_counts.shape[0] 220 >>> d = scmkl.calculate_d(num_cells) 221 >>> d 222 161 223 """ 224 d = int(np.sqrt(num_samples)*np.log(np.log(num_samples))) 225 226 return int(np.max([d, 100])) 227 228 229def sort_samples(train_indices, test_indices): 230 """ 231 Ensures that samples in adata obj are all training, then all 232 testing. 233 234 Parameters 235 ---------- 236 train_indices : np.ndarray 237 Indices in ad.AnnData object for training. 238 239 test_indices : np.ndarray 240 Indices in ad.AnnData object for testing. 241 242 Returns 243 ------- 244 sort_idc : np.ndarray 245 Ordered indices that will sort ad.AnnData object as all 246 training samples, then all testing. 247 248 train_indices : np.ndarray 249 The new training indices given the new index order, `sort_idc`. 250 251 test_indices : np.ndarray 252 The new testing indices given the new index order, `sort_idc`. 253 """ 254 sort_idc = np.concatenate([train_indices, test_indices]) 255 256 train_indices = np.arange(0, train_indices.shape[0]) 257 test_indices = np.arange(train_indices.shape[0], 258 train_indices.shape[0] + test_indices.shape[0]) 259 260 return sort_idc, train_indices, test_indices 261 262 263def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 264 feature_names: np.ndarray, cell_labels: np.ndarray, 265 group_dict: dict, obs_names: None | np.ndarray=None, 266 scale_data: bool=True, split_data: np.ndarray | None=None, 267 D: int | None=None, remove_features: bool=True, 268 train_ratio: float=0.8, distance_metric: str='euclidean', 269 kernel_type: str='Gaussian', random_state: int=1, 270 allow_multiclass: bool = False, 271 class_threshold: str | int = 'median', 272 reduction: str | None = None, tfidf: bool = False): 273 """ 274 Function to create an AnnData object to carry all relevant 275 information going forward. 276 277 Parameters 278 ---------- 279 X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame 280 A data matrix of cells by features (sparse array 281 recommended for large datasets). 282 283 feature_names : np.ndarray 284 Array of feature names corresponding with the features 285 in `X`. 286 287 cell_labels : np.ndarray 288 A numpy array of cell phenotypes corresponding with 289 the cells in `X`. 290 291 group_dict : dict 292 Dictionary containing feature grouping information (i.e. 293 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 294 np.array([...]), ...}`. 295 296 obs_names : None | np.ndarray 297 The cell names corresponding to `X` to be assigned to output 298 object `.obs_names` attribute. 299 300 scale_data : bool 301 If `True`, data matrix is log transformed and standard 302 scaled. 303 304 split_data : None | np.ndarray 305 If `None`, data will be split stratified by cell labels. 306 Else, is an array of precalculated train/test split 307 corresponding to samples. Can include labels for entire 308 dataset to benchmark performance or for only training 309 data to classify unknown cell types (i.e. `np.array(['train', 310 'test', ..., 'train'])`. 311 312 D : int 313 Number of Random Fourier Features used to calculate Z. 314 Should be a positive integer. Higher values of D will 315 increase classification accuracy at the cost of computation 316 time. If set to `None`, will be calculated given number of 317 samples. 318 319 remove_features : bool 320 If `True`, will remove features from `X` and `feature_names` 321 not in `group_dict` and remove features from groupings not in 322 `feature_names`. 323 324 train_ratio : float 325 Ratio of number of training samples to entire data set. Note: 326 if a threshold is applied, the ratio training samples may 327 decrease depending on class balance and `class_threshold` 328 parameter if `allow_multiclass = True`. 329 330 distance_metric : str 331 The pairwise distance metric used to estimate sigma. Must 332 be one of the options used in `scipy.spatial.distance.cdist`. 333 334 kernel_type : str 335 The approximated kernel function used to calculate Zs. 336 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 337 338 random_state : int 339 Integer random_state used to set the seed for 340 reproducibilty. 341 342 allow_multiclass : bool 343 If `False`, will ensure that cell labels are binary. 344 345 class_threshold : str | int 346 Number of samples allowed in the training data for each cell 347 class in the training data. If `'median'`, the median number 348 of cells per cell class will be the threshold for number of 349 samples per class. 350 351 reduction: str | None 352 Choose which dimension reduction technique to perform on 353 features within a group. 'svd' will run 354 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 355 by an array of 1s down to 50 dimensions. 356 357 tfidf: bool 358 Whether to calculate TFIDF transformation on peaks within 359 groupings. 360 361 Returns 362 ------- 363 adata : ad.AnnData 364 AnnData with the following attributes and keys: 365 366 `adata.X` (array_like): 367 Data matrix. 368 369 `adata.var_names` (array_like): 370 Feature names corresponding to `adata.X`. 371 372 `adata.obs['labels']` (array_like): 373 cell classes/phenotypes from `cell_labels`. 374 375 `adata.uns['train_indices']` (array_like): 376 Indices for training data. 377 378 `adata.uns['test_indices']` (array_like) 379 Indices for testing data. 380 381 `adata.uns['group_dict']` (dict): 382 Grouping information. 383 384 `adata.uns['seed_obj']` (np.random._generator.Generator): 385 Seed object with seed equal to 100 * `random_state`. 386 387 `adata.uns['D']` (int): 388 Number of dimensions to scMKL with. 389 390 `adata.uns['scale_data']` (bool): 391 Whether or not data is log and z-score transformed. 392 393 `adata.uns['distance_metric']` (str): 394 Distance metric as given. 395 396 `adata.uns['kernel_type']` (str): 397 Kernel function as given. 398 399 `adata.uns['svd']` (bool): 400 Whether to calculate SVD reduction. 401 402 `adata.uns['tfidf']` (bool): 403 Whether to calculate TF-IDF per grouping. 404 405 Examples 406 -------- 407 >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz') 408 >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True) 409 >>> group_dict = np.load('hallmark_genesets.pkl', 410 >>> allow_pickle = True) 411 >>> 412 >>> adata = scmkl.create_adata(X = data_mat, 413 ... feature_names = gene_names, 414 ... group_dict = group_dict) 415 >>> adata 416 AnnData object with n_obs × n_vars = 1000 × 4341 417 obs: 'labels' 418 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 419 'distance_metric', 'train_indices', 'test_indices' 420 """ 421 422 assert X.shape[1] == len(feature_names), ("Different number of features " 423 "in X than feature names") 424 425 if not allow_multiclass: 426 assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain " 427 "2 classes") 428 if D is not None: 429 assert isinstance(D, int) and D > 0, 'D must be a positive integer' 430 431 kernel_options = ['gaussian', 'laplacian', 'cauchy'] 432 assert kernel_type.lower() in kernel_options, ("Given kernel type not " 433 "implemented. Gaussian, " 434 "Laplacian, and Cauchy " 435 "are the acceptable " 436 "types.") 437 438 # Create adata object and add column names 439 adata = ad.AnnData(X) 440 adata.var_names = feature_names 441 442 if isinstance(obs_names, (np.ndarray)): 443 adata.obs_names = obs_names 444 445 filtered_feature_names, group_dict = _filter_features(feature_names, 446 group_dict) 447 448 if remove_features: 449 warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning) 450 adata = adata[:, filtered_feature_names] 451 452 gc.collect() 453 454 # Add metadata to adata object 455 adata.uns['group_dict'] = group_dict 456 adata.uns['seed_obj'] = np.random.default_rng(100*random_state) 457 adata.uns['scale_data'] = scale_data 458 adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0]) 459 adata.uns['kernel_type'] = kernel_type 460 adata.uns['distance_metric'] = distance_metric 461 adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None' 462 adata.uns['tfidf'] = tfidf 463 464 if (split_data is None): 465 assert X.shape[0] == len(cell_labels), ("Different number of cells " 466 "than labels") 467 adata.obs['labels'] = cell_labels 468 469 if (allow_multiclass == False): 470 split = _binary_split(cell_labels, 471 seed_obj = adata.uns['seed_obj'], 472 train_ratio = train_ratio) 473 train_indices, test_indices = split 474 475 elif (allow_multiclass == True): 476 split = _multi_class_split(cell_labels, 477 seed_obj = adata.uns['seed_obj'], 478 class_threshold = class_threshold, 479 train_ratio = train_ratio) 480 train_indices, test_indices = split 481 482 adata.uns['labeled_test'] = True 483 484 else: 485 x_eq_labs = X.shape[0] == len(cell_labels) 486 train_eq_labs = X.shape[0] == len(cell_labels) 487 assert x_eq_labs or train_eq_labs, ("Must give labels for all cells " 488 "or only for training cells") 489 490 train_indices = np.where(split_data == 'train')[0] 491 test_indices = np.where(split_data == 'test')[0] 492 493 if len(cell_labels) == len(train_indices): 494 495 padded_cell_labels = np.zeros((X.shape[0])).astype('object') 496 padded_cell_labels[train_indices] = cell_labels 497 padded_cell_labels[test_indices] = 'padded_test_label' 498 499 adata.obs['labels'] = padded_cell_labels 500 adata.uns['labeled_test'] = False 501 502 elif len(cell_labels) == len(split_data): 503 adata.obs['labels'] = cell_labels 504 adata.uns['labeled_test'] = True 505 506 # Ensuring all train samples are first in adata object followed by test 507 sort_idx, train_indices, test_indices = sort_samples(train_indices, 508 test_indices) 509 510 adata = adata[sort_idx] 511 512 if not isinstance(obs_names, (np.ndarray)): 513 adata.obs = adata.obs.reset_index(drop=True) 514 adata.obs.index = adata.obs.index.astype('O') 515 516 adata.uns['train_indices'] = train_indices 517 adata.uns['test_indices'] = test_indices 518 519 if not scale_data: 520 print("WARNING: Data will not be log transformed and scaled. " 521 "To change this behavior, set scale_data to True") 522 523 return adata 524 525 526def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 527 group_dict: dict | str, use_raw: bool=False, 528 scale_data: bool=True, split_data: np.ndarray | None=None, 529 D: int | None=None, remove_features: bool=True, 530 train_ratio: float=0.8, distance_metric: str='euclidean', 531 kernel_type: str='Gaussian', random_state: int=1, 532 allow_multiclass: bool = False, 533 class_threshold: str | int = 'median', 534 reduction: str | None = None, tfidf: bool = False): 535 """ 536 Function to format an `ad.AnnData` object to carry all relevant 537 information going forward. `adata.obs_names` will be retained. 538 539 **NOTE: Information not needed for running `scmkl` will be 540 removed.** 541 542 Parameters 543 ---------- 544 adata : ad.AnnData 545 Object with data for `scmkl` to be applied to. Only requirment 546 is that `.var_names` is correct and data matrix is in `adata.X` 547 or `adata.raw.X`. A h5ad file can be provided as a `str` and it 548 will be read in. 549 550 cell_labels : np.ndarray | str 551 If type `str`, the labels for `scmkl` to learn are captured 552 from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 553 phenotypes corresponding with the cells in `adata.X`. 554 555 group_dict : dict | str 556 Dictionary containing feature grouping information (i.e. 557 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 558 np.array([...]), ...}`. A pickle file can be provided as a `str` 559 and it will be read in. 560 561 obs_names : None | np.ndarray 562 The cell names corresponding to `X` to be assigned to output 563 object `.obs_names` attribute. 564 565 use_raw : bool 566 If `False`, will use `adata.X` to create new `adata`. Else, 567 will use `adata.raw.X`. 568 569 scale_data : bool 570 If `True`, data matrix is log transformed and standard 571 scaled. 572 573 split_data : None | np.ndarray 574 If `None`, data will be split stratified by cell labels. 575 Else, is an array of precalculated train/test split 576 corresponding to samples. Can include labels for entire 577 dataset to benchmark performance or for only training 578 data to classify unknown cell types (i.e. `np.array(['train', 579 'test', ..., 'train'])`. 580 581 D : int 582 Number of Random Fourier Features used to calculate Z. 583 Should be a positive integer. Higher values of D will 584 increase classification accuracy at the cost of computation 585 time. If set to `None`, will be calculated given number of 586 samples. 587 588 remove_features : bool 589 If `True`, will remove features from `X` and `feature_names` 590 not in `group_dict` and remove features from groupings not in 591 `feature_names`. 592 593 train_ratio : float 594 Ratio of number of training samples to entire data set. Note: 595 if a threshold is applied, the ratio training samples may 596 decrease depending on class balance and `class_threshold` 597 parameter if `allow_multiclass = True`. 598 599 distance_metric : str 600 The pairwise distance metric used to estimate sigma. Must 601 be one of the options used in `scipy.spatial.distance.cdist`. 602 603 kernel_type : str 604 The approximated kernel function used to calculate Zs. 605 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 606 607 random_state : int 608 Integer random_state used to set the seed for 609 reproducibilty. 610 611 allow_multiclass : bool 612 If `False`, will ensure that cell labels are binary. 613 614 class_threshold : str | int 615 Number of samples allowed in the training data for each cell 616 class in the training data. If `'median'`, the median number 617 of cells per cell class will be the threshold for number of 618 samples per class. 619 620 reduction: str | None 621 Choose which dimension reduction technique to perform on 622 features within a group. 'svd' will run 623 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 624 by an array of 1s down to 50 dimensions. 625 626 tfidf: bool 627 Whether to calculate TFIDF transformation on peaks within 628 groupings. 629 630 Returns 631 ------- 632 adata : ad.AnnData 633 AnnData with the following attributes and keys: 634 635 `adata.X` (array_like): 636 Data matrix. 637 638 `adata.var_names` (array_like): 639 Feature names corresponding to `adata.X`. 640 641 `adata.obs['labels']` (array_like): 642 cell classes/phenotypes from `cell_labels`. 643 644 `adata.uns['train_indices']` (array_like): 645 Indices for training data. 646 647 `adata.uns['test_indices']` (array_like) 648 Indices for testing data. 649 650 `adata.uns['group_dict']` (dict): 651 Grouping information. 652 653 `adata.uns['seed_obj']` (np.random._generator.Generator): 654 Seed object with seed equal to 100 * `random_state`. 655 656 `adata.uns['D']` (int): 657 Number of dimensions to scMKL with. 658 659 `adata.uns['scale_data']` (bool): 660 Whether or not data is log and z-score transformed. 661 662 `adata.uns['distance_metric']` (str): 663 Distance metric as given. 664 665 `adata.uns['kernel_type']` (str): 666 Kernel function as given. 667 668 `adata.uns['svd']` (bool): 669 Whether to calculate SVD reduction. 670 671 `adata.uns['tfidf']` (bool): 672 Whether to calculate TF-IDF per grouping. 673 674 Examples 675 -------- 676 >>> adata = ad.read_h5ad('MCF7_rna.h5ad') 677 >>> group_dict = np.load('hallmark_genesets.pkl', 678 >>> allow_pickle = True) 679 >>> 680 >>> 681 >>> # The labels in adata.obs we want to learn are 'celltypes' 682 >>> adata = scmkl.format_adata(adata, 'celltypes', 683 ... group_dict) 684 >>> adata 685 AnnData object with n_obs × n_vars = 1000 × 4341 686 obs: 'labels' 687 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 688 'distance_metric', 'train_indices', 'test_indices' 689 """ 690 if str == type(adata): 691 adata = ad.read_h5ad(adata) 692 693 if str == type(group_dict): 694 group_dict = np.load(group_dict, allow_pickle=True) 695 696 if str == type(cell_labels): 697 err_msg = f"{cell_labels} is not in `adata.obs`" 698 assert cell_labels in adata.obs.keys(), err_msg 699 cell_labels = adata.obs[cell_labels].to_numpy() 700 701 if use_raw: 702 assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`" 703 X = adata.raw.X 704 else: 705 X = adata.X 706 707 adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 708 group_dict, adata.obs_names.to_numpy().copy(), 709 scale_data, split_data, D, remove_features, 710 train_ratio, distance_metric, kernel_type, 711 random_state, allow_multiclass, class_threshold, 712 reduction, tfidf) 713 714 return adata
198def calculate_d(num_samples : int): 199 """ 200 This function calculates the optimal number of dimensions for 201 performance. See https://doi.org/10.48550/arXiv.1806.09178 for more 202 information. 203 204 Parameters 205 ---------- 206 num_samples : int 207 The number of samples in the data set including both training 208 and testing sets. 209 210 Returns 211 ------- 212 d : int 213 The optimal number of dimensions to run scMKL with the given 214 data set. 215 216 Examples 217 -------- 218 >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz') 219 >>> 220 >>> num_cells = raw_counts.shape[0] 221 >>> d = scmkl.calculate_d(num_cells) 222 >>> d 223 161 224 """ 225 d = int(np.sqrt(num_samples)*np.log(np.log(num_samples))) 226 227 return int(np.max([d, 100]))
This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.
Parameters
- num_samples (int): The number of samples in the data set including both training and testing sets.
Returns
- d (int): The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>>
>>> num_cells = raw_counts.shape[0]
>>> d = scmkl.calculate_d(num_cells)
>>> d
161
230def sort_samples(train_indices, test_indices): 231 """ 232 Ensures that samples in adata obj are all training, then all 233 testing. 234 235 Parameters 236 ---------- 237 train_indices : np.ndarray 238 Indices in ad.AnnData object for training. 239 240 test_indices : np.ndarray 241 Indices in ad.AnnData object for testing. 242 243 Returns 244 ------- 245 sort_idc : np.ndarray 246 Ordered indices that will sort ad.AnnData object as all 247 training samples, then all testing. 248 249 train_indices : np.ndarray 250 The new training indices given the new index order, `sort_idc`. 251 252 test_indices : np.ndarray 253 The new testing indices given the new index order, `sort_idc`. 254 """ 255 sort_idc = np.concatenate([train_indices, test_indices]) 256 257 train_indices = np.arange(0, train_indices.shape[0]) 258 test_indices = np.arange(train_indices.shape[0], 259 train_indices.shape[0] + test_indices.shape[0]) 260 261 return sort_idc, train_indices, test_indices
Ensures that samples in adata obj are all training, then all testing.
Parameters
- train_indices (np.ndarray): Indices in ad.AnnData object for training.
- test_indices (np.ndarray): Indices in ad.AnnData object for testing.
Returns
- sort_idc (np.ndarray): Ordered indices that will sort ad.AnnData object as all training samples, then all testing.
- train_indices (np.ndarray):
The new training indices given the new index order,
sort_idc
. - test_indices (np.ndarray):
The new testing indices given the new index order,
sort_idc
.
264def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 265 feature_names: np.ndarray, cell_labels: np.ndarray, 266 group_dict: dict, obs_names: None | np.ndarray=None, 267 scale_data: bool=True, split_data: np.ndarray | None=None, 268 D: int | None=None, remove_features: bool=True, 269 train_ratio: float=0.8, distance_metric: str='euclidean', 270 kernel_type: str='Gaussian', random_state: int=1, 271 allow_multiclass: bool = False, 272 class_threshold: str | int = 'median', 273 reduction: str | None = None, tfidf: bool = False): 274 """ 275 Function to create an AnnData object to carry all relevant 276 information going forward. 277 278 Parameters 279 ---------- 280 X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame 281 A data matrix of cells by features (sparse array 282 recommended for large datasets). 283 284 feature_names : np.ndarray 285 Array of feature names corresponding with the features 286 in `X`. 287 288 cell_labels : np.ndarray 289 A numpy array of cell phenotypes corresponding with 290 the cells in `X`. 291 292 group_dict : dict 293 Dictionary containing feature grouping information (i.e. 294 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 295 np.array([...]), ...}`. 296 297 obs_names : None | np.ndarray 298 The cell names corresponding to `X` to be assigned to output 299 object `.obs_names` attribute. 300 301 scale_data : bool 302 If `True`, data matrix is log transformed and standard 303 scaled. 304 305 split_data : None | np.ndarray 306 If `None`, data will be split stratified by cell labels. 307 Else, is an array of precalculated train/test split 308 corresponding to samples. Can include labels for entire 309 dataset to benchmark performance or for only training 310 data to classify unknown cell types (i.e. `np.array(['train', 311 'test', ..., 'train'])`. 312 313 D : int 314 Number of Random Fourier Features used to calculate Z. 315 Should be a positive integer. Higher values of D will 316 increase classification accuracy at the cost of computation 317 time. If set to `None`, will be calculated given number of 318 samples. 319 320 remove_features : bool 321 If `True`, will remove features from `X` and `feature_names` 322 not in `group_dict` and remove features from groupings not in 323 `feature_names`. 324 325 train_ratio : float 326 Ratio of number of training samples to entire data set. Note: 327 if a threshold is applied, the ratio training samples may 328 decrease depending on class balance and `class_threshold` 329 parameter if `allow_multiclass = True`. 330 331 distance_metric : str 332 The pairwise distance metric used to estimate sigma. Must 333 be one of the options used in `scipy.spatial.distance.cdist`. 334 335 kernel_type : str 336 The approximated kernel function used to calculate Zs. 337 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 338 339 random_state : int 340 Integer random_state used to set the seed for 341 reproducibilty. 342 343 allow_multiclass : bool 344 If `False`, will ensure that cell labels are binary. 345 346 class_threshold : str | int 347 Number of samples allowed in the training data for each cell 348 class in the training data. If `'median'`, the median number 349 of cells per cell class will be the threshold for number of 350 samples per class. 351 352 reduction: str | None 353 Choose which dimension reduction technique to perform on 354 features within a group. 'svd' will run 355 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 356 by an array of 1s down to 50 dimensions. 357 358 tfidf: bool 359 Whether to calculate TFIDF transformation on peaks within 360 groupings. 361 362 Returns 363 ------- 364 adata : ad.AnnData 365 AnnData with the following attributes and keys: 366 367 `adata.X` (array_like): 368 Data matrix. 369 370 `adata.var_names` (array_like): 371 Feature names corresponding to `adata.X`. 372 373 `adata.obs['labels']` (array_like): 374 cell classes/phenotypes from `cell_labels`. 375 376 `adata.uns['train_indices']` (array_like): 377 Indices for training data. 378 379 `adata.uns['test_indices']` (array_like) 380 Indices for testing data. 381 382 `adata.uns['group_dict']` (dict): 383 Grouping information. 384 385 `adata.uns['seed_obj']` (np.random._generator.Generator): 386 Seed object with seed equal to 100 * `random_state`. 387 388 `adata.uns['D']` (int): 389 Number of dimensions to scMKL with. 390 391 `adata.uns['scale_data']` (bool): 392 Whether or not data is log and z-score transformed. 393 394 `adata.uns['distance_metric']` (str): 395 Distance metric as given. 396 397 `adata.uns['kernel_type']` (str): 398 Kernel function as given. 399 400 `adata.uns['svd']` (bool): 401 Whether to calculate SVD reduction. 402 403 `adata.uns['tfidf']` (bool): 404 Whether to calculate TF-IDF per grouping. 405 406 Examples 407 -------- 408 >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz') 409 >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True) 410 >>> group_dict = np.load('hallmark_genesets.pkl', 411 >>> allow_pickle = True) 412 >>> 413 >>> adata = scmkl.create_adata(X = data_mat, 414 ... feature_names = gene_names, 415 ... group_dict = group_dict) 416 >>> adata 417 AnnData object with n_obs × n_vars = 1000 × 4341 418 obs: 'labels' 419 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 420 'distance_metric', 'train_indices', 'test_indices' 421 """ 422 423 assert X.shape[1] == len(feature_names), ("Different number of features " 424 "in X than feature names") 425 426 if not allow_multiclass: 427 assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain " 428 "2 classes") 429 if D is not None: 430 assert isinstance(D, int) and D > 0, 'D must be a positive integer' 431 432 kernel_options = ['gaussian', 'laplacian', 'cauchy'] 433 assert kernel_type.lower() in kernel_options, ("Given kernel type not " 434 "implemented. Gaussian, " 435 "Laplacian, and Cauchy " 436 "are the acceptable " 437 "types.") 438 439 # Create adata object and add column names 440 adata = ad.AnnData(X) 441 adata.var_names = feature_names 442 443 if isinstance(obs_names, (np.ndarray)): 444 adata.obs_names = obs_names 445 446 filtered_feature_names, group_dict = _filter_features(feature_names, 447 group_dict) 448 449 if remove_features: 450 warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning) 451 adata = adata[:, filtered_feature_names] 452 453 gc.collect() 454 455 # Add metadata to adata object 456 adata.uns['group_dict'] = group_dict 457 adata.uns['seed_obj'] = np.random.default_rng(100*random_state) 458 adata.uns['scale_data'] = scale_data 459 adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0]) 460 adata.uns['kernel_type'] = kernel_type 461 adata.uns['distance_metric'] = distance_metric 462 adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None' 463 adata.uns['tfidf'] = tfidf 464 465 if (split_data is None): 466 assert X.shape[0] == len(cell_labels), ("Different number of cells " 467 "than labels") 468 adata.obs['labels'] = cell_labels 469 470 if (allow_multiclass == False): 471 split = _binary_split(cell_labels, 472 seed_obj = adata.uns['seed_obj'], 473 train_ratio = train_ratio) 474 train_indices, test_indices = split 475 476 elif (allow_multiclass == True): 477 split = _multi_class_split(cell_labels, 478 seed_obj = adata.uns['seed_obj'], 479 class_threshold = class_threshold, 480 train_ratio = train_ratio) 481 train_indices, test_indices = split 482 483 adata.uns['labeled_test'] = True 484 485 else: 486 x_eq_labs = X.shape[0] == len(cell_labels) 487 train_eq_labs = X.shape[0] == len(cell_labels) 488 assert x_eq_labs or train_eq_labs, ("Must give labels for all cells " 489 "or only for training cells") 490 491 train_indices = np.where(split_data == 'train')[0] 492 test_indices = np.where(split_data == 'test')[0] 493 494 if len(cell_labels) == len(train_indices): 495 496 padded_cell_labels = np.zeros((X.shape[0])).astype('object') 497 padded_cell_labels[train_indices] = cell_labels 498 padded_cell_labels[test_indices] = 'padded_test_label' 499 500 adata.obs['labels'] = padded_cell_labels 501 adata.uns['labeled_test'] = False 502 503 elif len(cell_labels) == len(split_data): 504 adata.obs['labels'] = cell_labels 505 adata.uns['labeled_test'] = True 506 507 # Ensuring all train samples are first in adata object followed by test 508 sort_idx, train_indices, test_indices = sort_samples(train_indices, 509 test_indices) 510 511 adata = adata[sort_idx] 512 513 if not isinstance(obs_names, (np.ndarray)): 514 adata.obs = adata.obs.reset_index(drop=True) 515 adata.obs.index = adata.obs.index.astype('O') 516 517 adata.uns['train_indices'] = train_indices 518 adata.uns['test_indices'] = test_indices 519 520 if not scale_data: 521 print("WARNING: Data will not be log transformed and scaled. " 522 "To change this behavior, set scale_data to True") 523 524 return adata
Function to create an AnnData object to carry all relevant information going forward.
Parameters
- X (scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame): A data matrix of cells by features (sparse array recommended for large datasets).
- feature_names (np.ndarray):
Array of feature names corresponding with the features
in
X
. - cell_labels (np.ndarray):
A numpy array of cell phenotypes corresponding with
the cells in
X
. - group_dict (dict):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}
. - obs_names (None | np.ndarray):
The cell names corresponding to
X
to be assigned to output object.obs_names
attribute. - scale_data (bool):
If
True
, data matrix is log transformed and standard scaled. - split_data (None | np.ndarray):
If
None
, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e.np.array(['train', 'test', ..., 'train'])
. - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None
, will be calculated given number of samples. - remove_features (bool):
If
True
, will remove features fromX
andfeature_names
not ingroup_dict
and remove features from groupings not infeature_names
. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio training samples may
decrease depending on class balance and
class_threshold
parameter ifallow_multiclass = True
. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist
. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian'
,'Laplacian'
, or'Cauchy'
. - random_state (int): Integer random_state used to set the seed for reproducibilty.
- allow_multiclass (bool):
If
False
, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median'
, the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD
, 'linear' will multiply by an array of 1s down to 50 dimensions. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X
(array_like): Data matrix.adata.var_names
(array_like): Feature names corresponding toadata.X
.adata.obs['labels']
(array_like): cell classes/phenotypes fromcell_labels
.adata.uns['train_indices']
(array_like): Indices for training data.adata.uns['test_indices']
(array_like) Indices for testing data.adata.uns['group_dict']
(dict): Grouping information.adata.uns['seed_obj']
(np.random._generator.Generator): Seed object with seed equal to 100 *random_state
.adata.uns['D']
(int): Number of dimensions to scMKL with.adata.uns['scale_data']
(bool): Whether or not data is log and z-score transformed.adata.uns['distance_metric']
(str): Distance metric as given.adata.uns['kernel_type']
(str): Kernel function as given.adata.uns['svd']
(bool): Whether to calculate SVD reduction.adata.uns['tfidf']
(bool): Whether to calculate TF-IDF per grouping.
Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'
527def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 528 group_dict: dict | str, use_raw: bool=False, 529 scale_data: bool=True, split_data: np.ndarray | None=None, 530 D: int | None=None, remove_features: bool=True, 531 train_ratio: float=0.8, distance_metric: str='euclidean', 532 kernel_type: str='Gaussian', random_state: int=1, 533 allow_multiclass: bool = False, 534 class_threshold: str | int = 'median', 535 reduction: str | None = None, tfidf: bool = False): 536 """ 537 Function to format an `ad.AnnData` object to carry all relevant 538 information going forward. `adata.obs_names` will be retained. 539 540 **NOTE: Information not needed for running `scmkl` will be 541 removed.** 542 543 Parameters 544 ---------- 545 adata : ad.AnnData 546 Object with data for `scmkl` to be applied to. Only requirment 547 is that `.var_names` is correct and data matrix is in `adata.X` 548 or `adata.raw.X`. A h5ad file can be provided as a `str` and it 549 will be read in. 550 551 cell_labels : np.ndarray | str 552 If type `str`, the labels for `scmkl` to learn are captured 553 from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 554 phenotypes corresponding with the cells in `adata.X`. 555 556 group_dict : dict | str 557 Dictionary containing feature grouping information (i.e. 558 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 559 np.array([...]), ...}`. A pickle file can be provided as a `str` 560 and it will be read in. 561 562 obs_names : None | np.ndarray 563 The cell names corresponding to `X` to be assigned to output 564 object `.obs_names` attribute. 565 566 use_raw : bool 567 If `False`, will use `adata.X` to create new `adata`. Else, 568 will use `adata.raw.X`. 569 570 scale_data : bool 571 If `True`, data matrix is log transformed and standard 572 scaled. 573 574 split_data : None | np.ndarray 575 If `None`, data will be split stratified by cell labels. 576 Else, is an array of precalculated train/test split 577 corresponding to samples. Can include labels for entire 578 dataset to benchmark performance or for only training 579 data to classify unknown cell types (i.e. `np.array(['train', 580 'test', ..., 'train'])`. 581 582 D : int 583 Number of Random Fourier Features used to calculate Z. 584 Should be a positive integer. Higher values of D will 585 increase classification accuracy at the cost of computation 586 time. If set to `None`, will be calculated given number of 587 samples. 588 589 remove_features : bool 590 If `True`, will remove features from `X` and `feature_names` 591 not in `group_dict` and remove features from groupings not in 592 `feature_names`. 593 594 train_ratio : float 595 Ratio of number of training samples to entire data set. Note: 596 if a threshold is applied, the ratio training samples may 597 decrease depending on class balance and `class_threshold` 598 parameter if `allow_multiclass = True`. 599 600 distance_metric : str 601 The pairwise distance metric used to estimate sigma. Must 602 be one of the options used in `scipy.spatial.distance.cdist`. 603 604 kernel_type : str 605 The approximated kernel function used to calculate Zs. 606 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 607 608 random_state : int 609 Integer random_state used to set the seed for 610 reproducibilty. 611 612 allow_multiclass : bool 613 If `False`, will ensure that cell labels are binary. 614 615 class_threshold : str | int 616 Number of samples allowed in the training data for each cell 617 class in the training data. If `'median'`, the median number 618 of cells per cell class will be the threshold for number of 619 samples per class. 620 621 reduction: str | None 622 Choose which dimension reduction technique to perform on 623 features within a group. 'svd' will run 624 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 625 by an array of 1s down to 50 dimensions. 626 627 tfidf: bool 628 Whether to calculate TFIDF transformation on peaks within 629 groupings. 630 631 Returns 632 ------- 633 adata : ad.AnnData 634 AnnData with the following attributes and keys: 635 636 `adata.X` (array_like): 637 Data matrix. 638 639 `adata.var_names` (array_like): 640 Feature names corresponding to `adata.X`. 641 642 `adata.obs['labels']` (array_like): 643 cell classes/phenotypes from `cell_labels`. 644 645 `adata.uns['train_indices']` (array_like): 646 Indices for training data. 647 648 `adata.uns['test_indices']` (array_like) 649 Indices for testing data. 650 651 `adata.uns['group_dict']` (dict): 652 Grouping information. 653 654 `adata.uns['seed_obj']` (np.random._generator.Generator): 655 Seed object with seed equal to 100 * `random_state`. 656 657 `adata.uns['D']` (int): 658 Number of dimensions to scMKL with. 659 660 `adata.uns['scale_data']` (bool): 661 Whether or not data is log and z-score transformed. 662 663 `adata.uns['distance_metric']` (str): 664 Distance metric as given. 665 666 `adata.uns['kernel_type']` (str): 667 Kernel function as given. 668 669 `adata.uns['svd']` (bool): 670 Whether to calculate SVD reduction. 671 672 `adata.uns['tfidf']` (bool): 673 Whether to calculate TF-IDF per grouping. 674 675 Examples 676 -------- 677 >>> adata = ad.read_h5ad('MCF7_rna.h5ad') 678 >>> group_dict = np.load('hallmark_genesets.pkl', 679 >>> allow_pickle = True) 680 >>> 681 >>> 682 >>> # The labels in adata.obs we want to learn are 'celltypes' 683 >>> adata = scmkl.format_adata(adata, 'celltypes', 684 ... group_dict) 685 >>> adata 686 AnnData object with n_obs × n_vars = 1000 × 4341 687 obs: 'labels' 688 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 689 'distance_metric', 'train_indices', 'test_indices' 690 """ 691 if str == type(adata): 692 adata = ad.read_h5ad(adata) 693 694 if str == type(group_dict): 695 group_dict = np.load(group_dict, allow_pickle=True) 696 697 if str == type(cell_labels): 698 err_msg = f"{cell_labels} is not in `adata.obs`" 699 assert cell_labels in adata.obs.keys(), err_msg 700 cell_labels = adata.obs[cell_labels].to_numpy() 701 702 if use_raw: 703 assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`" 704 X = adata.raw.X 705 else: 706 X = adata.X 707 708 adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 709 group_dict, adata.obs_names.to_numpy().copy(), 710 scale_data, split_data, D, remove_features, 711 train_ratio, distance_metric, kernel_type, 712 random_state, allow_multiclass, class_threshold, 713 reduction, tfidf) 714 715 return adata
Function to format an ad.AnnData
object to carry all relevant
information going forward. adata.obs_names
will be retained.
NOTE: Information not needed for running scmkl
will be
removed.
Parameters
- adata (ad.AnnData):
Object with data for
scmkl
to be applied to. Only requirment is that.var_names
is correct and data matrix is inadata.X
oradata.raw.X
. A h5ad file can be provided as astr
and it will be read in. - cell_labels (np.ndarray | str):
If type
str
, the labels forscmkl
to learn are captured fromadata.obs['cell_labels']
. Else, anp.ndarray
of cell phenotypes corresponding with the cells inadata.X
. - group_dict (dict | str):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}
. A pickle file can be provided as astr
and it will be read in. - obs_names (None | np.ndarray):
The cell names corresponding to
X
to be assigned to output object.obs_names
attribute. - use_raw (bool):
If
False
, will useadata.X
to create newadata
. Else, will useadata.raw.X
. - scale_data (bool):
If
True
, data matrix is log transformed and standard scaled. - split_data (None | np.ndarray):
If
None
, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e.np.array(['train', 'test', ..., 'train'])
. - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None
, will be calculated given number of samples. - remove_features (bool):
If
True
, will remove features fromX
andfeature_names
not ingroup_dict
and remove features from groupings not infeature_names
. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio training samples may
decrease depending on class balance and
class_threshold
parameter ifallow_multiclass = True
. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist
. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian'
,'Laplacian'
, or'Cauchy'
. - random_state (int): Integer random_state used to set the seed for reproducibilty.
- allow_multiclass (bool):
If
False
, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median'
, the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD
, 'linear' will multiply by an array of 1s down to 50 dimensions. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X
(array_like): Data matrix.adata.var_names
(array_like): Feature names corresponding toadata.X
.adata.obs['labels']
(array_like): cell classes/phenotypes fromcell_labels
.adata.uns['train_indices']
(array_like): Indices for training data.adata.uns['test_indices']
(array_like) Indices for testing data.adata.uns['group_dict']
(dict): Grouping information.adata.uns['seed_obj']
(np.random._generator.Generator): Seed object with seed equal to 100 *random_state
.adata.uns['D']
(int): Number of dimensions to scMKL with.adata.uns['scale_data']
(bool): Whether or not data is log and z-score transformed.adata.uns['distance_metric']
(str): Distance metric as given.adata.uns['kernel_type']
(str): Kernel function as given.adata.uns['svd']
(bool): Whether to calculate SVD reduction.adata.uns['tfidf']
(bool): Whether to calculate TF-IDF per grouping.
Examples
>>> adata = ad.read_h5ad('MCF7_rna.h5ad')
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>>
>>> # The labels in adata.obs we want to learn are 'celltypes'
>>> adata = scmkl.format_adata(adata, 'celltypes',
... group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'