scmkl.create_adata
1import numpy as np 2import anndata as ad 3import scipy 4import pandas as pd 5import gc 6import warnings 7 8 9def filter_features(feature_names: np.ndarray, group_dict: dict): 10 """ 11 Function to remove features only in feature names or group_dict. 12 Any features not included in group_dict will be removed from the 13 matrix. Also puts the features in the same relative order (of 14 included features) 15 16 Parameters 17 ---------- 18 feature_names : np.ndarray 19 Numpy array of corresponding feature names. 20 21 group_dict : dict 22 Dictionary containing feature grouping information. 23 Example: {geneset: np.array(gene_1, gene_2, ..., 24 gene_n)} 25 Returns 26 ------- 27 feature_names : np.ndarray 28 Numpy array of corresponding feature names from group_dict. 29 30 group_dict : dict 31 Dictionary containing features overlapping input grouping 32 information and full feature names. 33 """ 34 group_features = set() 35 feature_set = set(feature_names) 36 37 # Store all objects in dictionary in set 38 for group in group_dict.keys(): 39 group_features.update(set(group_dict[group])) 40 41 # Finds intersection between group features and features in data 42 # Converts to nd.array and sorts to preserve order of feature names 43 group_feats = list(feature_set.intersection(set(group_dict[group]))) 44 group_dict[group] = np.sort(np.array(group_feats)) 45 46 # Only keeping groupings that have at least two features 47 group_dict = {group : group_dict[group] for group in group_dict.keys() 48 if len(group_dict[group]) > 1} 49 50 group_features = np.array(list(group_features.intersection(feature_set))) 51 52 return group_features, group_dict 53 54 55def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 56 class_threshold: str | int | None=None, 57 train_ratio: float=0.8): 58 """ 59 Function for calculating the training and testing cell positions 60 for multiclass data sets. 61 62 Parameters 63 ---------- 64 y : array_like 65 Should be an iterable object cooresponding to samples in 66 `ad.AnnData` object. 67 68 seed_obj : np.random._generator.Generator 69 Seed used to randomly sample and split data. 70 71 train_ratio : float 72 Ratio of number of training samples to entire data set. 73 Note: if a threshold is applied, the ratio training samples 74 may decrease depending on class balance and `class_threshold` 75 parameter. 76 77 class_threshold : str | int 78 If is type `int`, classes with more samples than 79 class_threshold will be sampled. If `'median'`, 80 samples will be sampled to the median number of samples per 81 class. 82 83 Returns 84 ------- 85 train_indices : np.ndarray 86 Indices for training samples. 87 88 test_indices : np.ndarray 89 Indices for testing samples. 90 """ 91 uniq_labels = np.unique(y) 92 93 # Finding indices for each cell class 94 class_positions = {class_ : np.where(y == class_)[0] 95 for class_ in uniq_labels} 96 97 # Capturing training indices while maintaining original class proportions 98 train_samples = {class_ : seed_obj.choice(class_positions[class_], 99 int(len(class_positions[class_]) 100 * train_ratio), 101 replace = False) 102 for class_ in class_positions.keys()} 103 104 # Capturing testing indices while maintaining original class proportions 105 test_samples = {class_ : np.setdiff1d(class_positions[class_], 106 train_samples[class_]) 107 for class_ in class_positions.keys()} 108 109 # Applying threshold for samples per class 110 if class_threshold == 'median': 111 cells_per_class = [len(values) for values in train_samples.values()] 112 class_threshold = int(np.median(cells_per_class)) 113 114 if isinstance(class_threshold, int): 115 # Down sample to class_threshold 116 for class_ in train_samples.keys(): 117 if len(train_samples[class_]) > class_threshold: 118 train_samples[class_] = seed_obj.choice(train_samples[class_], 119 class_threshold) 120 121 train_indices = np.array([idx for class_ in train_samples.keys() 122 for idx in train_samples[class_]]) 123 124 test_indices = np.array([idx for class_ in test_samples.keys() 125 for idx in test_samples[class_]]) 126 127 return train_indices, test_indices 128 129 130def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 131 train_ratio: float=0.8, 132 seed_obj: np.random._generator.Generator=np.random.default_rng(100)): 133 """ 134 Function to calculate training and testing indices for given 135 dataset. If train indices are given, it will calculate the test 136 indices. If train_indices == None, then it calculates both indices, 137 preserving the ratio of each label in y 138 139 Parameters 140 ---------- 141 y : np.ndarray 142 Numpy array of cell labels. Can have any number of classes 143 for this function. 144 145 train_indices : np.ndarray | None 146 Optional array of pre-determined training indices 147 148 train_ratio : float 149 Decimal value ratio of features in training/testing sets 150 151 seed_obj : np.random._generator.Generator 152 Numpy random state used for random processes. Can be 153 specified for reproducubility or set by default. 154 155 156 Returns 157 ------- 158 train_indices : np.ndarray 159 Array of indices of training cells. 160 161 test_indices : np.ndarray: 162 Array of indices of testing cells. 163 """ 164 # If train indices aren't provided 165 if train_indices is None: 166 167 unique_labels = np.unique(y) 168 train_indices = [] 169 170 for label in unique_labels: 171 172 # Find indices of each unique label 173 label_indices = np.where(y == label)[0] 174 175 # Sample these indices according to train ratio 176 n = int(len(label_indices) * train_ratio) 177 train_label_indices = seed_obj.choice(label_indices, n, 178 replace = False) 179 train_indices.extend(train_label_indices) 180 else: 181 assert len(train_indices) <= len(y), ("More train indices than there " 182 "are samples") 183 184 train_indices = np.array(train_indices) 185 186 # Test indices are the indices not in the train_indices 187 test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 188 assume_unique = True) 189 190 return train_indices, test_indices 191 192 193def calculate_d(num_samples : int): 194 """ 195 This function calculates the optimal number of dimensions for 196 performance. See https://doi.org/10.48550/arXiv.1806.09178 for more 197 information. 198 199 Parameters 200 ---------- 201 num_samples : int 202 The number of samples in the data set including both training 203 and testing sets. 204 205 Returns 206 ------- 207 d : int 208 The optimal number of dimensions to run scMKL with the given 209 data set. 210 211 Examples 212 -------- 213 >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz') 214 >>> 215 >>> num_cells = raw_counts.shape[0] 216 >>> d = scmkl.calculate_d(num_cells) 217 >>> d 218 161 219 """ 220 d = int(np.sqrt(num_samples)*np.log(np.log(num_samples))) 221 222 return int(np.max([d, 100])) 223 224 225def sort_samples(train_indices, test_indices): 226 """ 227 Ensures that samples in adata obj are all training, then all 228 testing. 229 230 Parameters 231 ---------- 232 train_indices : np.ndarray 233 Indices in ad.AnnData object for training. 234 235 test_indices : np.ndarray 236 Indices in ad.AnnData object for testing. 237 238 Returns 239 ------- 240 sort_idc : np.ndarray 241 Ordered indices that will sort ad.AnnData object as all 242 training samples, then all testing. 243 244 train_indices : np.ndarray 245 The new training indices given the new index order, `sort_idc`. 246 247 test_indices : np.ndarray 248 The new testing indices given the new index order, `sort_idc`. 249 """ 250 sort_idc = np.concatenate([train_indices, test_indices]) 251 252 train_indices = np.arange(0, train_indices.shape[0]) 253 test_indices = np.arange(train_indices.shape[0], 254 train_indices.shape[0] + test_indices.shape[0]) 255 256 return sort_idc, train_indices, test_indices 257 258 259def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 260 feature_names: np.ndarray, cell_labels: np.ndarray, 261 group_dict: dict, obs_names: None | np.ndarray=None, 262 scale_data: bool=True, transform_data: bool=False, 263 split_data: np.ndarray | None=None, D: int | None=None, 264 remove_features: bool=True, train_ratio: float=0.8, 265 distance_metric: str='euclidean', kernel_type: str='Gaussian', 266 random_state: int=1, allow_multiclass: bool = False, 267 class_threshold: str | int | None = None, 268 reduction: str | None = None, tfidf: bool = False): 269 """ 270 Function to create an AnnData object to carry all relevant 271 information going forward. 272 273 Parameters 274 ---------- 275 X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame 276 A data matrix of cells by features (sparse array 277 recommended for large datasets). 278 279 feature_names : np.ndarray 280 Array of feature names corresponding with the features 281 in `X`. 282 283 cell_labels : np.ndarray 284 A numpy array of cell phenotypes corresponding with 285 the cells in `X`. 286 287 group_dict : dict 288 Dictionary containing feature grouping information (i.e. 289 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 290 np.array([...]), ...}`. 291 292 obs_names : None | np.ndarray 293 The cell names corresponding to `X` to be assigned to output 294 object `.obs_names` attribute. 295 296 scale_data : bool 297 If `True`, data matrix is log transformed and standard 298 scaled. Default is `True`. 299 300 transform_data : bool 301 If `True`, data will be log1p transformed (recommended for 302 counts data). Default is `False`. 303 304 split_data : None | np.ndarray 305 If `None`, data will be split stratified by cell labels. 306 Else, is an array of precalculated train/test split 307 corresponding to samples. Can include labels for entire 308 dataset to benchmark performance or for only training 309 data to classify unknown cell types (i.e. `np.array(['train', 310 'test', ..., 'train'])`. 311 312 D : int 313 Number of Random Fourier Features used to calculate Z. 314 Should be a positive integer. Higher values of D will 315 increase classification accuracy at the cost of computation 316 time. If set to `None`, will be calculated given number of 317 samples. 318 319 remove_features : bool 320 If `True`, will remove features from `X` and `feature_names` 321 not in `group_dict` and remove features from groupings not in 322 `feature_names`. 323 324 train_ratio : float 325 Ratio of number of training samples to entire data set. Note: 326 if a threshold is applied, the ratio training samples may 327 decrease depending on class balance and `class_threshold` 328 parameter if `allow_multiclass = True`. 329 330 distance_metric : str 331 The pairwise distance metric used to estimate sigma. Must 332 be one of the options used in `scipy.spatial.distance.cdist`. 333 334 kernel_type : str 335 The approximated kernel function used to calculate Zs. 336 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 337 338 random_state : int 339 Integer random_state used to set the seed for 340 reproducibilty. 341 342 allow_multiclass : bool 343 If `False`, will ensure that cell labels are binary. 344 345 class_threshold : str | int 346 Number of samples allowed in the training data for each cell 347 class in the training data. If `'median'`, the median number 348 of cells per cell class will be the threshold for number of 349 samples per class. 350 351 reduction: str | None 352 Choose which dimension reduction technique to perform on 353 features within a group. 'svd' will run 354 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 355 by an array of 1s down to 50 dimensions. 'pca' will replace 356 each group values with 50 PCs from principal component 357 analysis. 358 359 tfidf: bool 360 Whether to calculate TFIDF transformation on peaks within 361 groupings. 362 363 Returns 364 ------- 365 adata : ad.AnnData 366 AnnData with the following attributes and keys: 367 368 `adata.X` (array_like): 369 Data matrix. 370 371 `adata.var_names` (array_like): 372 Feature names corresponding to `adata.X`. 373 374 `adata.obs['labels']` (array_like): 375 cell classes/phenotypes from `cell_labels`. 376 377 `adata.uns['train_indices']` (array_like): 378 Indices for training data. 379 380 `adata.uns['test_indices']` (array_like) 381 Indices for testing data. 382 383 `adata.uns['group_dict']` (dict): 384 Grouping information. 385 386 `adata.uns['seed_obj']` (np.random._generator.Generator): 387 Seed object with seed equal to 100 * `random_state`. 388 389 `adata.uns['D']` (int): 390 Number of dimensions to scMKL with. 391 392 `adata.uns['scale_data']` (bool): 393 Whether or not data is scaled. 394 395 `adata.uns['transform_data']` (bool): 396 Whether or not data is log1p transformed. 397 398 `adata.uns['distance_metric']` (str): 399 Distance metric as given. 400 401 `adata.uns['kernel_type']` (str): 402 Kernel function as given. 403 404 `adata.uns['svd']` (bool): 405 Whether to calculate SVD reduction. 406 407 `adata.uns['tfidf']` (bool): 408 Whether to calculate TF-IDF per grouping. 409 410 Examples 411 -------- 412 >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz') 413 >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True) 414 >>> group_dict = np.load('hallmark_genesets.pkl', 415 >>> allow_pickle = True) 416 >>> 417 >>> adata = scmkl.create_adata(X = data_mat, 418 ... feature_names = gene_names, 419 ... group_dict = group_dict) 420 >>> adata 421 AnnData object with n_obs × n_vars = 1000 × 4341 422 obs: 'labels' 423 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 424 'distance_metric', 'train_indices', 'test_indices' 425 """ 426 427 assert X.shape[1] == len(feature_names), ("Different number of features " 428 "in X than feature names") 429 430 if not allow_multiclass: 431 assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain " 432 "2 classes") 433 if D is not None: 434 assert isinstance(D, int) and D > 0, 'D must be a positive integer' 435 436 kernel_options = ['gaussian', 'laplacian', 'cauchy'] 437 assert kernel_type.lower() in kernel_options, ("Given kernel type not " 438 "implemented. Gaussian, " 439 "Laplacian, and Cauchy " 440 "are the acceptable " 441 "types.") 442 443 # Create adata object and add column names 444 adata = ad.AnnData(X) 445 adata.var_names = feature_names 446 447 if isinstance(obs_names, (np.ndarray)): 448 adata.obs_names = obs_names 449 450 filtered_feature_names, group_dict = filter_features(feature_names, 451 group_dict) 452 453 # Ensuring that there are common features between feature_names and groups 454 overlap_err = ("No common features between feature names and grouping " 455 "dict. Check grouping.") 456 assert len(filtered_feature_names) > 0, overlap_err 457 458 if remove_features: 459 warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning) 460 adata = adata[:, filtered_feature_names] 461 462 gc.collect() 463 464 # Add metadata to adata object 465 adata.uns['group_dict'] = group_dict 466 adata.uns['seed_obj'] = np.random.default_rng(100*random_state) 467 adata.uns['scale_data'] = scale_data 468 adata.uns['transform_data'] = transform_data 469 adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0]) 470 adata.uns['kernel_type'] = kernel_type 471 adata.uns['distance_metric'] = distance_metric 472 adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None' 473 adata.uns['tfidf'] = tfidf 474 475 if (split_data is None): 476 assert X.shape[0] == len(cell_labels), ("Different number of cells " 477 "than labels") 478 adata.obs['labels'] = cell_labels 479 480 if (allow_multiclass == False): 481 split = binary_split(cell_labels, 482 seed_obj = adata.uns['seed_obj'], 483 train_ratio = train_ratio) 484 train_indices, test_indices = split 485 486 elif (allow_multiclass == True): 487 split = multi_class_split(cell_labels, 488 seed_obj = adata.uns['seed_obj'], 489 class_threshold = class_threshold, 490 train_ratio = train_ratio) 491 train_indices, test_indices = split 492 493 adata.uns['labeled_test'] = True 494 495 else: 496 sd_err_message = "`split_data` argument must be of type np.ndarray" 497 assert isinstance(split_data, np.ndarray), sd_err_message 498 x_eq_labs = X.shape[0] == len(cell_labels) 499 train_eq_labs = X.shape[0] == len(cell_labels) 500 assert x_eq_labs or train_eq_labs, ("Must give labels for all cells " 501 "or only for training cells") 502 503 train_indices = np.where(split_data == 'train')[0] 504 test_indices = np.where(split_data == 'test')[0] 505 506 if len(cell_labels) == len(train_indices): 507 508 padded_cell_labels = np.zeros((X.shape[0])).astype('object') 509 padded_cell_labels[train_indices] = cell_labels 510 padded_cell_labels[test_indices] = 'padded_test_label' 511 512 adata.obs['labels'] = padded_cell_labels 513 adata.uns['labeled_test'] = False 514 515 elif len(cell_labels) == len(split_data): 516 adata.obs['labels'] = cell_labels 517 adata.uns['labeled_test'] = True 518 519 # Ensuring all train samples are first in adata object followed by test 520 sort_idx, train_indices, test_indices = sort_samples(train_indices, 521 test_indices) 522 523 adata = adata[sort_idx] 524 525 if not isinstance(obs_names, (np.ndarray)): 526 adata.obs = adata.obs.reset_index(drop=True) 527 adata.obs.index = adata.obs.index.astype('O') 528 529 adata.uns['train_indices'] = train_indices 530 adata.uns['test_indices'] = test_indices 531 532 if not scale_data: 533 print("WARNING: Data will not be scaled. " 534 "To change this behavior, set scale_data to True") 535 if not transform_data: 536 print("WARNING: Data will not be transformed." 537 "To change this behavior, set transform_data to True") 538 539 return adata 540 541 542def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 543 group_dict: dict | str, use_raw: bool=False, 544 scale_data: bool=True, transform_data: bool=False, 545 split_data: np.ndarray | None=None, D: int | None=None, 546 remove_features: bool=True, train_ratio: float=0.8, 547 distance_metric: str='euclidean', kernel_type: str='Gaussian', 548 random_state: int=1, allow_multiclass: bool = False, 549 class_threshold: str | int | None=None, 550 reduction: str | None = None, tfidf: bool = False): 551 """ 552 Function to format an `ad.AnnData` object to carry all relevant 553 information going forward. `adata.obs_names` will be retained. 554 555 **NOTE: Information not needed for running `scmkl` will be 556 removed.** 557 558 Parameters 559 ---------- 560 adata : ad.AnnData 561 Object with data for `scmkl` to be applied to. Only requirment 562 is that `.var_names` is correct and data matrix is in `adata.X` 563 or `adata.raw.X`. A h5ad file can be provided as a `str` and it 564 will be read in. 565 566 cell_labels : np.ndarray | str 567 If type `str`, the labels for `scmkl` to learn are captured 568 from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 569 phenotypes corresponding with the cells in `adata.X`. 570 571 group_dict : dict | str 572 Dictionary containing feature grouping information (i.e. 573 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 574 np.array([...]), ...}`. A pickle file can be provided as a `str` 575 and it will be read in. 576 577 obs_names : None | np.ndarray 578 The cell names corresponding to `X` to be assigned to output 579 object `.obs_names` attribute. 580 581 use_raw : bool 582 If `False`, will use `adata.X` to create new `adata`. Else, 583 will use `adata.raw.X`. 584 585 scale_data : bool 586 If `True`, data matrix is log transformed and standard 587 scaled. 588 589 transform_data : bool 590 If `True`, data will be log1p transformed (recommended for 591 counts data). Default is `False`. 592 593 split_data : None | np.ndarray 594 If `None`, data will be split stratified by cell labels. 595 Else, is an array of precalculated train/test split 596 corresponding to samples. Can include labels for entire 597 dataset to benchmark performance or for only training 598 data to classify unknown cell types (i.e. `np.array(['train', 599 'test', ..., 'train'])`. 600 601 D : int 602 Number of Random Fourier Features used to calculate Z. 603 Should be a positive integer. Higher values of D will 604 increase classification accuracy at the cost of computation 605 time. If set to `None`, will be calculated given number of 606 samples. 607 608 remove_features : bool 609 If `True`, will remove features from `X` and `feature_names` 610 not in `group_dict` and remove features from groupings not in 611 `feature_names`. 612 613 train_ratio : float 614 Ratio of number of training samples to entire data set. Note: 615 if a threshold is applied, the ratio training samples may 616 decrease depending on class balance and `class_threshold` 617 parameter if `allow_multiclass = True`. 618 619 distance_metric : str 620 The pairwise distance metric used to estimate sigma. Must 621 be one of the options used in `scipy.spatial.distance.cdist`. 622 623 kernel_type : str 624 The approximated kernel function used to calculate Zs. 625 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 626 627 random_state : int 628 Integer random_state used to set the seed for 629 reproducibilty. 630 631 allow_multiclass : bool 632 If `False`, will ensure that cell labels are binary. 633 634 class_threshold : str | int 635 Number of samples allowed in the training data for each cell 636 class in the training data. If `'median'`, the median number 637 of cells per cell class will be the threshold for number of 638 samples per class. 639 640 reduction: str | None 641 Choose which dimension reduction technique to perform on 642 features within a group. 'svd' will run 643 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 644 by an array of 1s down to 50 dimensions. 645 646 tfidf: bool 647 Whether to calculate TFIDF transformation on peaks within 648 groupings. 649 650 Returns 651 ------- 652 adata : ad.AnnData 653 AnnData with the following attributes and keys: 654 655 `adata.X` (array_like): 656 Data matrix. 657 658 `adata.var_names` (array_like): 659 Feature names corresponding to `adata.X`. 660 661 `adata.obs['labels']` (array_like): 662 cell classes/phenotypes from `cell_labels`. 663 664 `adata.uns['train_indices']` (array_like): 665 Indices for training data. 666 667 `adata.uns['test_indices']` (array_like) 668 Indices for testing data. 669 670 `adata.uns['group_dict']` (dict): 671 Grouping information. 672 673 `adata.uns['seed_obj']` (np.random._generator.Generator): 674 Seed object with seed equal to 100 * `random_state`. 675 676 `adata.uns['D']` (int): 677 Number of dimensions to scMKL with. 678 679 `adata.uns['scale_data']` (bool): 680 Whether or not data is scaled. 681 682 `adata.uns['transform_data']` (bool): 683 Whether or not data is log1p transformed. 684 685 `adata.uns['distance_metric']` (str): 686 Distance metric as given. 687 688 `adata.uns['kernel_type']` (str): 689 Kernel function as given. 690 691 `adata.uns['svd']` (bool): 692 Whether to calculate SVD reduction. 693 694 `adata.uns['tfidf']` (bool): 695 Whether to calculate TF-IDF per grouping. 696 697 Examples 698 -------- 699 >>> adata = ad.read_h5ad('MCF7_rna.h5ad') 700 >>> group_dict = np.load('hallmark_genesets.pkl', 701 >>> allow_pickle = True) 702 >>> 703 >>> 704 >>> # The labels in adata.obs we want to learn are 'celltypes' 705 >>> adata = scmkl.format_adata(adata, 'celltypes', 706 ... group_dict) 707 >>> adata 708 AnnData object with n_obs × n_vars = 1000 × 4341 709 obs: 'labels' 710 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 711 'distance_metric', 'train_indices', 'test_indices' 712 """ 713 if str == type(adata): 714 adata = ad.read_h5ad(adata) 715 716 if str == type(group_dict): 717 group_dict = np.load(group_dict, allow_pickle=True) 718 719 if str == type(cell_labels): 720 err_msg = f"{cell_labels} is not in `adata.obs`" 721 assert cell_labels in adata.obs.keys(), err_msg 722 cell_labels = adata.obs[cell_labels].to_numpy() 723 724 if use_raw: 725 assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`" 726 X = adata.raw.X 727 else: 728 X = adata.X 729 730 adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 731 group_dict, adata.obs_names.to_numpy().copy(), 732 scale_data, transform_data, split_data, D, remove_features, 733 train_ratio, distance_metric, kernel_type, 734 random_state, allow_multiclass, class_threshold, 735 reduction, tfidf) 736 737 return adata
10def filter_features(feature_names: np.ndarray, group_dict: dict): 11 """ 12 Function to remove features only in feature names or group_dict. 13 Any features not included in group_dict will be removed from the 14 matrix. Also puts the features in the same relative order (of 15 included features) 16 17 Parameters 18 ---------- 19 feature_names : np.ndarray 20 Numpy array of corresponding feature names. 21 22 group_dict : dict 23 Dictionary containing feature grouping information. 24 Example: {geneset: np.array(gene_1, gene_2, ..., 25 gene_n)} 26 Returns 27 ------- 28 feature_names : np.ndarray 29 Numpy array of corresponding feature names from group_dict. 30 31 group_dict : dict 32 Dictionary containing features overlapping input grouping 33 information and full feature names. 34 """ 35 group_features = set() 36 feature_set = set(feature_names) 37 38 # Store all objects in dictionary in set 39 for group in group_dict.keys(): 40 group_features.update(set(group_dict[group])) 41 42 # Finds intersection between group features and features in data 43 # Converts to nd.array and sorts to preserve order of feature names 44 group_feats = list(feature_set.intersection(set(group_dict[group]))) 45 group_dict[group] = np.sort(np.array(group_feats)) 46 47 # Only keeping groupings that have at least two features 48 group_dict = {group : group_dict[group] for group in group_dict.keys() 49 if len(group_dict[group]) > 1} 50 51 group_features = np.array(list(group_features.intersection(feature_set))) 52 53 return group_features, group_dict
Function to remove features only in feature names or group_dict. Any features not included in group_dict will be removed from the matrix. Also puts the features in the same relative order (of included features)
Parameters
- feature_names (np.ndarray): Numpy array of corresponding feature names.
- group_dict (dict): Dictionary containing feature grouping information. Example: {geneset: np.array(gene_1, gene_2, ..., gene_n)}
Returns
- feature_names (np.ndarray): Numpy array of corresponding feature names from group_dict.
- group_dict (dict): Dictionary containing features overlapping input grouping information and full feature names.
56def multi_class_split(y: np.ndarray, seed_obj: np.random._generator.Generator, 57 class_threshold: str | int | None=None, 58 train_ratio: float=0.8): 59 """ 60 Function for calculating the training and testing cell positions 61 for multiclass data sets. 62 63 Parameters 64 ---------- 65 y : array_like 66 Should be an iterable object cooresponding to samples in 67 `ad.AnnData` object. 68 69 seed_obj : np.random._generator.Generator 70 Seed used to randomly sample and split data. 71 72 train_ratio : float 73 Ratio of number of training samples to entire data set. 74 Note: if a threshold is applied, the ratio training samples 75 may decrease depending on class balance and `class_threshold` 76 parameter. 77 78 class_threshold : str | int 79 If is type `int`, classes with more samples than 80 class_threshold will be sampled. If `'median'`, 81 samples will be sampled to the median number of samples per 82 class. 83 84 Returns 85 ------- 86 train_indices : np.ndarray 87 Indices for training samples. 88 89 test_indices : np.ndarray 90 Indices for testing samples. 91 """ 92 uniq_labels = np.unique(y) 93 94 # Finding indices for each cell class 95 class_positions = {class_ : np.where(y == class_)[0] 96 for class_ in uniq_labels} 97 98 # Capturing training indices while maintaining original class proportions 99 train_samples = {class_ : seed_obj.choice(class_positions[class_], 100 int(len(class_positions[class_]) 101 * train_ratio), 102 replace = False) 103 for class_ in class_positions.keys()} 104 105 # Capturing testing indices while maintaining original class proportions 106 test_samples = {class_ : np.setdiff1d(class_positions[class_], 107 train_samples[class_]) 108 for class_ in class_positions.keys()} 109 110 # Applying threshold for samples per class 111 if class_threshold == 'median': 112 cells_per_class = [len(values) for values in train_samples.values()] 113 class_threshold = int(np.median(cells_per_class)) 114 115 if isinstance(class_threshold, int): 116 # Down sample to class_threshold 117 for class_ in train_samples.keys(): 118 if len(train_samples[class_]) > class_threshold: 119 train_samples[class_] = seed_obj.choice(train_samples[class_], 120 class_threshold) 121 122 train_indices = np.array([idx for class_ in train_samples.keys() 123 for idx in train_samples[class_]]) 124 125 test_indices = np.array([idx for class_ in test_samples.keys() 126 for idx in test_samples[class_]]) 127 128 return train_indices, test_indices
Function for calculating the training and testing cell positions for multiclass data sets.
Parameters
- y (array_like):
Should be an iterable object cooresponding to samples in
ad.AnnDataobject. - seed_obj (np.random._generator.Generator): Seed used to randomly sample and split data.
- train_ratio (float):
Ratio of number of training samples to entire data set.
Note: if a threshold is applied, the ratio training samples
may decrease depending on class balance and
class_thresholdparameter. - class_threshold (str | int):
If is type
int, classes with more samples than class_threshold will be sampled. If'median', samples will be sampled to the median number of samples per class.
Returns
- train_indices (np.ndarray): Indices for training samples.
- test_indices (np.ndarray): Indices for testing samples.
131def binary_split(y: np.ndarray, train_indices: np.ndarray | None=None, 132 train_ratio: float=0.8, 133 seed_obj: np.random._generator.Generator=np.random.default_rng(100)): 134 """ 135 Function to calculate training and testing indices for given 136 dataset. If train indices are given, it will calculate the test 137 indices. If train_indices == None, then it calculates both indices, 138 preserving the ratio of each label in y 139 140 Parameters 141 ---------- 142 y : np.ndarray 143 Numpy array of cell labels. Can have any number of classes 144 for this function. 145 146 train_indices : np.ndarray | None 147 Optional array of pre-determined training indices 148 149 train_ratio : float 150 Decimal value ratio of features in training/testing sets 151 152 seed_obj : np.random._generator.Generator 153 Numpy random state used for random processes. Can be 154 specified for reproducubility or set by default. 155 156 157 Returns 158 ------- 159 train_indices : np.ndarray 160 Array of indices of training cells. 161 162 test_indices : np.ndarray: 163 Array of indices of testing cells. 164 """ 165 # If train indices aren't provided 166 if train_indices is None: 167 168 unique_labels = np.unique(y) 169 train_indices = [] 170 171 for label in unique_labels: 172 173 # Find indices of each unique label 174 label_indices = np.where(y == label)[0] 175 176 # Sample these indices according to train ratio 177 n = int(len(label_indices) * train_ratio) 178 train_label_indices = seed_obj.choice(label_indices, n, 179 replace = False) 180 train_indices.extend(train_label_indices) 181 else: 182 assert len(train_indices) <= len(y), ("More train indices than there " 183 "are samples") 184 185 train_indices = np.array(train_indices) 186 187 # Test indices are the indices not in the train_indices 188 test_indices = np.setdiff1d(np.arange(len(y)), train_indices, 189 assume_unique = True) 190 191 return train_indices, test_indices
Function to calculate training and testing indices for given dataset. If train indices are given, it will calculate the test indices. If train_indices == None, then it calculates both indices, preserving the ratio of each label in y
Parameters
- y (np.ndarray): Numpy array of cell labels. Can have any number of classes for this function.
- train_indices (np.ndarray | None): Optional array of pre-determined training indices
- train_ratio (float): Decimal value ratio of features in training/testing sets
- seed_obj (np.random._generator.Generator): Numpy random state used for random processes. Can be specified for reproducubility or set by default.
Returns
- train_indices (np.ndarray): Array of indices of training cells.
- test_indices (np.ndarray:): Array of indices of testing cells.
194def calculate_d(num_samples : int): 195 """ 196 This function calculates the optimal number of dimensions for 197 performance. See https://doi.org/10.48550/arXiv.1806.09178 for more 198 information. 199 200 Parameters 201 ---------- 202 num_samples : int 203 The number of samples in the data set including both training 204 and testing sets. 205 206 Returns 207 ------- 208 d : int 209 The optimal number of dimensions to run scMKL with the given 210 data set. 211 212 Examples 213 -------- 214 >>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz') 215 >>> 216 >>> num_cells = raw_counts.shape[0] 217 >>> d = scmkl.calculate_d(num_cells) 218 >>> d 219 161 220 """ 221 d = int(np.sqrt(num_samples)*np.log(np.log(num_samples))) 222 223 return int(np.max([d, 100]))
This function calculates the optimal number of dimensions for performance. See https://doi.org/10.48550/arXiv.1806.09178 for more information.
Parameters
- num_samples (int): The number of samples in the data set including both training and testing sets.
Returns
- d (int): The optimal number of dimensions to run scMKL with the given data set.
Examples
>>> raw_counts = scipy.sparse.load_npz('MCF7_counts.npz')
>>>
>>> num_cells = raw_counts.shape[0]
>>> d = scmkl.calculate_d(num_cells)
>>> d
161
226def sort_samples(train_indices, test_indices): 227 """ 228 Ensures that samples in adata obj are all training, then all 229 testing. 230 231 Parameters 232 ---------- 233 train_indices : np.ndarray 234 Indices in ad.AnnData object for training. 235 236 test_indices : np.ndarray 237 Indices in ad.AnnData object for testing. 238 239 Returns 240 ------- 241 sort_idc : np.ndarray 242 Ordered indices that will sort ad.AnnData object as all 243 training samples, then all testing. 244 245 train_indices : np.ndarray 246 The new training indices given the new index order, `sort_idc`. 247 248 test_indices : np.ndarray 249 The new testing indices given the new index order, `sort_idc`. 250 """ 251 sort_idc = np.concatenate([train_indices, test_indices]) 252 253 train_indices = np.arange(0, train_indices.shape[0]) 254 test_indices = np.arange(train_indices.shape[0], 255 train_indices.shape[0] + test_indices.shape[0]) 256 257 return sort_idc, train_indices, test_indices
Ensures that samples in adata obj are all training, then all testing.
Parameters
- train_indices (np.ndarray): Indices in ad.AnnData object for training.
- test_indices (np.ndarray): Indices in ad.AnnData object for testing.
Returns
- sort_idc (np.ndarray): Ordered indices that will sort ad.AnnData object as all training samples, then all testing.
- train_indices (np.ndarray):
The new training indices given the new index order,
sort_idc. - test_indices (np.ndarray):
The new testing indices given the new index order,
sort_idc.
260def create_adata(X: scipy.sparse._csc.csc_matrix | np.ndarray | pd.DataFrame, 261 feature_names: np.ndarray, cell_labels: np.ndarray, 262 group_dict: dict, obs_names: None | np.ndarray=None, 263 scale_data: bool=True, transform_data: bool=False, 264 split_data: np.ndarray | None=None, D: int | None=None, 265 remove_features: bool=True, train_ratio: float=0.8, 266 distance_metric: str='euclidean', kernel_type: str='Gaussian', 267 random_state: int=1, allow_multiclass: bool = False, 268 class_threshold: str | int | None = None, 269 reduction: str | None = None, tfidf: bool = False): 270 """ 271 Function to create an AnnData object to carry all relevant 272 information going forward. 273 274 Parameters 275 ---------- 276 X : scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame 277 A data matrix of cells by features (sparse array 278 recommended for large datasets). 279 280 feature_names : np.ndarray 281 Array of feature names corresponding with the features 282 in `X`. 283 284 cell_labels : np.ndarray 285 A numpy array of cell phenotypes corresponding with 286 the cells in `X`. 287 288 group_dict : dict 289 Dictionary containing feature grouping information (i.e. 290 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 291 np.array([...]), ...}`. 292 293 obs_names : None | np.ndarray 294 The cell names corresponding to `X` to be assigned to output 295 object `.obs_names` attribute. 296 297 scale_data : bool 298 If `True`, data matrix is log transformed and standard 299 scaled. Default is `True`. 300 301 transform_data : bool 302 If `True`, data will be log1p transformed (recommended for 303 counts data). Default is `False`. 304 305 split_data : None | np.ndarray 306 If `None`, data will be split stratified by cell labels. 307 Else, is an array of precalculated train/test split 308 corresponding to samples. Can include labels for entire 309 dataset to benchmark performance or for only training 310 data to classify unknown cell types (i.e. `np.array(['train', 311 'test', ..., 'train'])`. 312 313 D : int 314 Number of Random Fourier Features used to calculate Z. 315 Should be a positive integer. Higher values of D will 316 increase classification accuracy at the cost of computation 317 time. If set to `None`, will be calculated given number of 318 samples. 319 320 remove_features : bool 321 If `True`, will remove features from `X` and `feature_names` 322 not in `group_dict` and remove features from groupings not in 323 `feature_names`. 324 325 train_ratio : float 326 Ratio of number of training samples to entire data set. Note: 327 if a threshold is applied, the ratio training samples may 328 decrease depending on class balance and `class_threshold` 329 parameter if `allow_multiclass = True`. 330 331 distance_metric : str 332 The pairwise distance metric used to estimate sigma. Must 333 be one of the options used in `scipy.spatial.distance.cdist`. 334 335 kernel_type : str 336 The approximated kernel function used to calculate Zs. 337 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 338 339 random_state : int 340 Integer random_state used to set the seed for 341 reproducibilty. 342 343 allow_multiclass : bool 344 If `False`, will ensure that cell labels are binary. 345 346 class_threshold : str | int 347 Number of samples allowed in the training data for each cell 348 class in the training data. If `'median'`, the median number 349 of cells per cell class will be the threshold for number of 350 samples per class. 351 352 reduction: str | None 353 Choose which dimension reduction technique to perform on 354 features within a group. 'svd' will run 355 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 356 by an array of 1s down to 50 dimensions. 'pca' will replace 357 each group values with 50 PCs from principal component 358 analysis. 359 360 tfidf: bool 361 Whether to calculate TFIDF transformation on peaks within 362 groupings. 363 364 Returns 365 ------- 366 adata : ad.AnnData 367 AnnData with the following attributes and keys: 368 369 `adata.X` (array_like): 370 Data matrix. 371 372 `adata.var_names` (array_like): 373 Feature names corresponding to `adata.X`. 374 375 `adata.obs['labels']` (array_like): 376 cell classes/phenotypes from `cell_labels`. 377 378 `adata.uns['train_indices']` (array_like): 379 Indices for training data. 380 381 `adata.uns['test_indices']` (array_like) 382 Indices for testing data. 383 384 `adata.uns['group_dict']` (dict): 385 Grouping information. 386 387 `adata.uns['seed_obj']` (np.random._generator.Generator): 388 Seed object with seed equal to 100 * `random_state`. 389 390 `adata.uns['D']` (int): 391 Number of dimensions to scMKL with. 392 393 `adata.uns['scale_data']` (bool): 394 Whether or not data is scaled. 395 396 `adata.uns['transform_data']` (bool): 397 Whether or not data is log1p transformed. 398 399 `adata.uns['distance_metric']` (str): 400 Distance metric as given. 401 402 `adata.uns['kernel_type']` (str): 403 Kernel function as given. 404 405 `adata.uns['svd']` (bool): 406 Whether to calculate SVD reduction. 407 408 `adata.uns['tfidf']` (bool): 409 Whether to calculate TF-IDF per grouping. 410 411 Examples 412 -------- 413 >>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz') 414 >>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True) 415 >>> group_dict = np.load('hallmark_genesets.pkl', 416 >>> allow_pickle = True) 417 >>> 418 >>> adata = scmkl.create_adata(X = data_mat, 419 ... feature_names = gene_names, 420 ... group_dict = group_dict) 421 >>> adata 422 AnnData object with n_obs × n_vars = 1000 × 4341 423 obs: 'labels' 424 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 425 'distance_metric', 'train_indices', 'test_indices' 426 """ 427 428 assert X.shape[1] == len(feature_names), ("Different number of features " 429 "in X than feature names") 430 431 if not allow_multiclass: 432 assert len(np.unique(cell_labels)) == 2, ("cell_labels must contain " 433 "2 classes") 434 if D is not None: 435 assert isinstance(D, int) and D > 0, 'D must be a positive integer' 436 437 kernel_options = ['gaussian', 'laplacian', 'cauchy'] 438 assert kernel_type.lower() in kernel_options, ("Given kernel type not " 439 "implemented. Gaussian, " 440 "Laplacian, and Cauchy " 441 "are the acceptable " 442 "types.") 443 444 # Create adata object and add column names 445 adata = ad.AnnData(X) 446 adata.var_names = feature_names 447 448 if isinstance(obs_names, (np.ndarray)): 449 adata.obs_names = obs_names 450 451 filtered_feature_names, group_dict = filter_features(feature_names, 452 group_dict) 453 454 # Ensuring that there are common features between feature_names and groups 455 overlap_err = ("No common features between feature names and grouping " 456 "dict. Check grouping.") 457 assert len(filtered_feature_names) > 0, overlap_err 458 459 if remove_features: 460 warnings.filterwarnings('ignore', category = ad.ImplicitModificationWarning) 461 adata = adata[:, filtered_feature_names] 462 463 gc.collect() 464 465 # Add metadata to adata object 466 adata.uns['group_dict'] = group_dict 467 adata.uns['seed_obj'] = np.random.default_rng(100*random_state) 468 adata.uns['scale_data'] = scale_data 469 adata.uns['transform_data'] = transform_data 470 adata.uns['D'] = D if D is not None else calculate_d(adata.shape[0]) 471 adata.uns['kernel_type'] = kernel_type 472 adata.uns['distance_metric'] = distance_metric 473 adata.uns['reduction'] = reduction if isinstance(reduction, str) else 'None' 474 adata.uns['tfidf'] = tfidf 475 476 if (split_data is None): 477 assert X.shape[0] == len(cell_labels), ("Different number of cells " 478 "than labels") 479 adata.obs['labels'] = cell_labels 480 481 if (allow_multiclass == False): 482 split = binary_split(cell_labels, 483 seed_obj = adata.uns['seed_obj'], 484 train_ratio = train_ratio) 485 train_indices, test_indices = split 486 487 elif (allow_multiclass == True): 488 split = multi_class_split(cell_labels, 489 seed_obj = adata.uns['seed_obj'], 490 class_threshold = class_threshold, 491 train_ratio = train_ratio) 492 train_indices, test_indices = split 493 494 adata.uns['labeled_test'] = True 495 496 else: 497 sd_err_message = "`split_data` argument must be of type np.ndarray" 498 assert isinstance(split_data, np.ndarray), sd_err_message 499 x_eq_labs = X.shape[0] == len(cell_labels) 500 train_eq_labs = X.shape[0] == len(cell_labels) 501 assert x_eq_labs or train_eq_labs, ("Must give labels for all cells " 502 "or only for training cells") 503 504 train_indices = np.where(split_data == 'train')[0] 505 test_indices = np.where(split_data == 'test')[0] 506 507 if len(cell_labels) == len(train_indices): 508 509 padded_cell_labels = np.zeros((X.shape[0])).astype('object') 510 padded_cell_labels[train_indices] = cell_labels 511 padded_cell_labels[test_indices] = 'padded_test_label' 512 513 adata.obs['labels'] = padded_cell_labels 514 adata.uns['labeled_test'] = False 515 516 elif len(cell_labels) == len(split_data): 517 adata.obs['labels'] = cell_labels 518 adata.uns['labeled_test'] = True 519 520 # Ensuring all train samples are first in adata object followed by test 521 sort_idx, train_indices, test_indices = sort_samples(train_indices, 522 test_indices) 523 524 adata = adata[sort_idx] 525 526 if not isinstance(obs_names, (np.ndarray)): 527 adata.obs = adata.obs.reset_index(drop=True) 528 adata.obs.index = adata.obs.index.astype('O') 529 530 adata.uns['train_indices'] = train_indices 531 adata.uns['test_indices'] = test_indices 532 533 if not scale_data: 534 print("WARNING: Data will not be scaled. " 535 "To change this behavior, set scale_data to True") 536 if not transform_data: 537 print("WARNING: Data will not be transformed." 538 "To change this behavior, set transform_data to True") 539 540 return adata
Function to create an AnnData object to carry all relevant information going forward.
Parameters
- X (scipy.sparse.csc_matrix | np.ndarray | pd.DataFrame): A data matrix of cells by features (sparse array recommended for large datasets).
- feature_names (np.ndarray):
Array of feature names corresponding with the features
in
X. - cell_labels (np.ndarray):
A numpy array of cell phenotypes corresponding with
the cells in
X. - group_dict (dict):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}. - obs_names (None | np.ndarray):
The cell names corresponding to
Xto be assigned to output object.obs_namesattribute. - scale_data (bool):
If
True, data matrix is log transformed and standard scaled. Default isTrue. - transform_data (bool):
If
True, data will be log1p transformed (recommended for counts data). Default isFalse. - split_data (None | np.ndarray):
If
None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e.np.array(['train', 'test', ..., 'train']). - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None, will be calculated given number of samples. - remove_features (bool):
If
True, will remove features fromXandfeature_namesnot ingroup_dictand remove features from groupings not infeature_names. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio training samples may
decrease depending on class balance and
class_thresholdparameter ifallow_multiclass = True. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian','Laplacian', or'Cauchy'. - random_state (int): Integer random_state used to set the seed for reproducibilty.
- allow_multiclass (bool):
If
False, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median', the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. 'pca' will replace each group values with 50 PCs from principal component analysis. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X(array_like): Data matrix.adata.var_names(array_like): Feature names corresponding toadata.X.adata.obs['labels'](array_like): cell classes/phenotypes fromcell_labels.adata.uns['train_indices'](array_like): Indices for training data.adata.uns['test_indices'](array_like) Indices for testing data.adata.uns['group_dict'](dict): Grouping information.adata.uns['seed_obj'](np.random._generator.Generator): Seed object with seed equal to 100 *random_state.adata.uns['D'](int): Number of dimensions to scMKL with.adata.uns['scale_data'](bool): Whether or not data is scaled.adata.uns['transform_data'](bool): Whether or not data is log1p transformed.adata.uns['distance_metric'](str): Distance metric as given.adata.uns['kernel_type'](str): Kernel function as given.adata.uns['svd'](bool): Whether to calculate SVD reduction.adata.uns['tfidf'](bool): Whether to calculate TF-IDF per grouping.
Examples
>>> data_mat = scipy.sparse.load_npz('MCF7_RNA_matrix.npz')
>>> gene_names = np.load('MCF7_gene_names.pkl', allow_pickle = True)
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>> adata = scmkl.create_adata(X = data_mat,
... feature_names = gene_names,
... group_dict = group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'
543def format_adata(adata: ad.AnnData | str, cell_labels: np.ndarray | str, 544 group_dict: dict | str, use_raw: bool=False, 545 scale_data: bool=True, transform_data: bool=False, 546 split_data: np.ndarray | None=None, D: int | None=None, 547 remove_features: bool=True, train_ratio: float=0.8, 548 distance_metric: str='euclidean', kernel_type: str='Gaussian', 549 random_state: int=1, allow_multiclass: bool = False, 550 class_threshold: str | int | None=None, 551 reduction: str | None = None, tfidf: bool = False): 552 """ 553 Function to format an `ad.AnnData` object to carry all relevant 554 information going forward. `adata.obs_names` will be retained. 555 556 **NOTE: Information not needed for running `scmkl` will be 557 removed.** 558 559 Parameters 560 ---------- 561 adata : ad.AnnData 562 Object with data for `scmkl` to be applied to. Only requirment 563 is that `.var_names` is correct and data matrix is in `adata.X` 564 or `adata.raw.X`. A h5ad file can be provided as a `str` and it 565 will be read in. 566 567 cell_labels : np.ndarray | str 568 If type `str`, the labels for `scmkl` to learn are captured 569 from `adata.obs['cell_labels']`. Else, a `np.ndarray` of cell 570 phenotypes corresponding with the cells in `adata.X`. 571 572 group_dict : dict | str 573 Dictionary containing feature grouping information (i.e. 574 `{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: 575 np.array([...]), ...}`. A pickle file can be provided as a `str` 576 and it will be read in. 577 578 obs_names : None | np.ndarray 579 The cell names corresponding to `X` to be assigned to output 580 object `.obs_names` attribute. 581 582 use_raw : bool 583 If `False`, will use `adata.X` to create new `adata`. Else, 584 will use `adata.raw.X`. 585 586 scale_data : bool 587 If `True`, data matrix is log transformed and standard 588 scaled. 589 590 transform_data : bool 591 If `True`, data will be log1p transformed (recommended for 592 counts data). Default is `False`. 593 594 split_data : None | np.ndarray 595 If `None`, data will be split stratified by cell labels. 596 Else, is an array of precalculated train/test split 597 corresponding to samples. Can include labels for entire 598 dataset to benchmark performance or for only training 599 data to classify unknown cell types (i.e. `np.array(['train', 600 'test', ..., 'train'])`. 601 602 D : int 603 Number of Random Fourier Features used to calculate Z. 604 Should be a positive integer. Higher values of D will 605 increase classification accuracy at the cost of computation 606 time. If set to `None`, will be calculated given number of 607 samples. 608 609 remove_features : bool 610 If `True`, will remove features from `X` and `feature_names` 611 not in `group_dict` and remove features from groupings not in 612 `feature_names`. 613 614 train_ratio : float 615 Ratio of number of training samples to entire data set. Note: 616 if a threshold is applied, the ratio training samples may 617 decrease depending on class balance and `class_threshold` 618 parameter if `allow_multiclass = True`. 619 620 distance_metric : str 621 The pairwise distance metric used to estimate sigma. Must 622 be one of the options used in `scipy.spatial.distance.cdist`. 623 624 kernel_type : str 625 The approximated kernel function used to calculate Zs. 626 Must be one of `'Gaussian'`, `'Laplacian'`, or `'Cauchy'`. 627 628 random_state : int 629 Integer random_state used to set the seed for 630 reproducibilty. 631 632 allow_multiclass : bool 633 If `False`, will ensure that cell labels are binary. 634 635 class_threshold : str | int 636 Number of samples allowed in the training data for each cell 637 class in the training data. If `'median'`, the median number 638 of cells per cell class will be the threshold for number of 639 samples per class. 640 641 reduction: str | None 642 Choose which dimension reduction technique to perform on 643 features within a group. 'svd' will run 644 `sklearn.decomposition.TruncatedSVD`, 'linear' will multiply 645 by an array of 1s down to 50 dimensions. 646 647 tfidf: bool 648 Whether to calculate TFIDF transformation on peaks within 649 groupings. 650 651 Returns 652 ------- 653 adata : ad.AnnData 654 AnnData with the following attributes and keys: 655 656 `adata.X` (array_like): 657 Data matrix. 658 659 `adata.var_names` (array_like): 660 Feature names corresponding to `adata.X`. 661 662 `adata.obs['labels']` (array_like): 663 cell classes/phenotypes from `cell_labels`. 664 665 `adata.uns['train_indices']` (array_like): 666 Indices for training data. 667 668 `adata.uns['test_indices']` (array_like) 669 Indices for testing data. 670 671 `adata.uns['group_dict']` (dict): 672 Grouping information. 673 674 `adata.uns['seed_obj']` (np.random._generator.Generator): 675 Seed object with seed equal to 100 * `random_state`. 676 677 `adata.uns['D']` (int): 678 Number of dimensions to scMKL with. 679 680 `adata.uns['scale_data']` (bool): 681 Whether or not data is scaled. 682 683 `adata.uns['transform_data']` (bool): 684 Whether or not data is log1p transformed. 685 686 `adata.uns['distance_metric']` (str): 687 Distance metric as given. 688 689 `adata.uns['kernel_type']` (str): 690 Kernel function as given. 691 692 `adata.uns['svd']` (bool): 693 Whether to calculate SVD reduction. 694 695 `adata.uns['tfidf']` (bool): 696 Whether to calculate TF-IDF per grouping. 697 698 Examples 699 -------- 700 >>> adata = ad.read_h5ad('MCF7_rna.h5ad') 701 >>> group_dict = np.load('hallmark_genesets.pkl', 702 >>> allow_pickle = True) 703 >>> 704 >>> 705 >>> # The labels in adata.obs we want to learn are 'celltypes' 706 >>> adata = scmkl.format_adata(adata, 'celltypes', 707 ... group_dict) 708 >>> adata 709 AnnData object with n_obs × n_vars = 1000 × 4341 710 obs: 'labels' 711 uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type', 712 'distance_metric', 'train_indices', 'test_indices' 713 """ 714 if str == type(adata): 715 adata = ad.read_h5ad(adata) 716 717 if str == type(group_dict): 718 group_dict = np.load(group_dict, allow_pickle=True) 719 720 if str == type(cell_labels): 721 err_msg = f"{cell_labels} is not in `adata.obs`" 722 assert cell_labels in adata.obs.keys(), err_msg 723 cell_labels = adata.obs[cell_labels].to_numpy() 724 725 if use_raw: 726 assert adata.raw, "`adata.raw` is empty, set `use_raw` to `False`" 727 X = adata.raw.X 728 else: 729 X = adata.X 730 731 adata = create_adata(X, adata.var_names.to_numpy().copy(), cell_labels, 732 group_dict, adata.obs_names.to_numpy().copy(), 733 scale_data, transform_data, split_data, D, remove_features, 734 train_ratio, distance_metric, kernel_type, 735 random_state, allow_multiclass, class_threshold, 736 reduction, tfidf) 737 738 return adata
Function to format an ad.AnnData object to carry all relevant
information going forward. adata.obs_names will be retained.
NOTE: Information not needed for running scmkl will be
removed.
Parameters
- adata (ad.AnnData):
Object with data for
scmklto be applied to. Only requirment is that.var_namesis correct and data matrix is inadata.Xoradata.raw.X. A h5ad file can be provided as astrand it will be read in. - cell_labels (np.ndarray | str):
If type
str, the labels forscmklto learn are captured fromadata.obs['cell_labels']. Else, anp.ndarrayof cell phenotypes corresponding with the cells inadata.X. - group_dict (dict | str):
Dictionary containing feature grouping information (i.e.
{geneset1: np.array([gene_1, gene_2, ..., gene_n]), geneset2: np.array([...]), ...}. A pickle file can be provided as astrand it will be read in. - obs_names (None | np.ndarray):
The cell names corresponding to
Xto be assigned to output object.obs_namesattribute. - use_raw (bool):
If
False, will useadata.Xto create newadata. Else, will useadata.raw.X. - scale_data (bool):
If
True, data matrix is log transformed and standard scaled. - transform_data (bool):
If
True, data will be log1p transformed (recommended for counts data). Default isFalse. - split_data (None | np.ndarray):
If
None, data will be split stratified by cell labels. Else, is an array of precalculated train/test split corresponding to samples. Can include labels for entire dataset to benchmark performance or for only training data to classify unknown cell types (i.e.np.array(['train', 'test', ..., 'train']). - D (int):
Number of Random Fourier Features used to calculate Z.
Should be a positive integer. Higher values of D will
increase classification accuracy at the cost of computation
time. If set to
None, will be calculated given number of samples. - remove_features (bool):
If
True, will remove features fromXandfeature_namesnot ingroup_dictand remove features from groupings not infeature_names. - train_ratio (float):
Ratio of number of training samples to entire data set. Note:
if a threshold is applied, the ratio training samples may
decrease depending on class balance and
class_thresholdparameter ifallow_multiclass = True. - distance_metric (str):
The pairwise distance metric used to estimate sigma. Must
be one of the options used in
scipy.spatial.distance.cdist. - kernel_type (str):
The approximated kernel function used to calculate Zs.
Must be one of
'Gaussian','Laplacian', or'Cauchy'. - random_state (int): Integer random_state used to set the seed for reproducibilty.
- allow_multiclass (bool):
If
False, will ensure that cell labels are binary. - class_threshold (str | int):
Number of samples allowed in the training data for each cell
class in the training data. If
'median', the median number of cells per cell class will be the threshold for number of samples per class. - reduction (str | None):
Choose which dimension reduction technique to perform on
features within a group. 'svd' will run
sklearn.decomposition.TruncatedSVD, 'linear' will multiply by an array of 1s down to 50 dimensions. - tfidf (bool): Whether to calculate TFIDF transformation on peaks within groupings.
Returns
adata (ad.AnnData): AnnData with the following attributes and keys:
adata.X(array_like): Data matrix.adata.var_names(array_like): Feature names corresponding toadata.X.adata.obs['labels'](array_like): cell classes/phenotypes fromcell_labels.adata.uns['train_indices'](array_like): Indices for training data.adata.uns['test_indices'](array_like) Indices for testing data.adata.uns['group_dict'](dict): Grouping information.adata.uns['seed_obj'](np.random._generator.Generator): Seed object with seed equal to 100 *random_state.adata.uns['D'](int): Number of dimensions to scMKL with.adata.uns['scale_data'](bool): Whether or not data is scaled.adata.uns['transform_data'](bool): Whether or not data is log1p transformed.adata.uns['distance_metric'](str): Distance metric as given.adata.uns['kernel_type'](str): Kernel function as given.adata.uns['svd'](bool): Whether to calculate SVD reduction.adata.uns['tfidf'](bool): Whether to calculate TF-IDF per grouping.
Examples
>>> adata = ad.read_h5ad('MCF7_rna.h5ad')
>>> group_dict = np.load('hallmark_genesets.pkl',
>>> allow_pickle = True)
>>>
>>>
>>> # The labels in adata.obs we want to learn are 'celltypes'
>>> adata = scmkl.format_adata(adata, 'celltypes',
... group_dict)
>>> adata
AnnData object with n_obs × n_vars = 1000 × 4341
obs: 'labels'
uns: 'group_dict', 'seed_obj', 'scale_data', 'D', 'kernel_type',
'distance_metric', 'train_indices', 'test_indices'