scmkl.dataframes
1import os 2import re 3import numpy as np 4import pandas as pd 5 6 7def _parse_result_type(results: dict): 8 """ 9 Returns whether or not there are multiple results per class. 10 11 Parameters 12 ---------- 13 results : dict 14 Either the output of `scmkl.run()` or `scmkl.one_v_rest()` or 15 a dictionary of those results. 16 17 Returns 18 ------- 19 is_mult, is_many : bool, bool 20 If `is_mult` is `True`, then results are multiclass. If 21 `is_many` is `True`, results contain multiple outputs. 22 23 """ 24 # Single result cases 25 if 'Classes' in results.keys(): 26 is_mult = True 27 is_many = False 28 return is_mult, is_many 29 elif 'Norms' in results.keys(): 30 is_mult = False 31 is_many = False 32 return is_mult, is_many 33 34 # Multiresult cases 35 keys = list(results.keys()) 36 if 'Classes' in results[keys[0]].keys(): 37 is_mult = True 38 is_many = True 39 return is_mult, is_many 40 elif 'Norms' in results[keys[0]].keys(): 41 is_mult = False 42 is_many = True 43 return is_mult, is_many 44 else: 45 print("Unknown result structure", flush=True) 46 47 48def sort_groups(df: pd.DataFrame, group_col: str='Group', 49 norm_col: str='Kernel Weight'): 50 """ 51 Takes a dataframe with `group_col` and returns sorted group list 52 with groups in decending order by their weights. Assumes there is 53 one instance of each group. 54 55 Parameters 56 ---------- 57 df : pd.DataFrame 58 A dataframe with `group_col` and `norm_col` to be sorted by. 59 60 group_col : str 61 The column containing the group names. 62 63 norm_col : str 64 The column containing the kernel weights. 65 66 Returns 67 ------- 68 group_order : list 69 A list of groups in descending order according to their kernel 70 weights. 71 72 Examples 73 -------- 74 >>> result = scmkl.run(adata, alpha_list) 75 >>> weights = scmkl.get_weights(result) 76 >>> group_order = scmkl.sort_groups(weights, 'Group', 77 ... 'Kernel Weight') 78 >>> 79 >>> group_order 80 ['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLM...', ...] 
81 """ 82 df = df.copy() 83 df = df.sort_values(norm_col, ascending=False) 84 group_order = list(df[group_col]) 85 86 return group_order 87 88 89def format_group_names(group_names: list | pd.Series | np.ndarray, 90 rm_words: list=list()): 91 """ 92 Takes an ArrayLike object of group names and formats them. 93 94 Parameters 95 ---------- 96 group_names : array_like 97 An array of group names to format. 98 99 rm_words : list 100 Words to remove from all group names. 101 102 Returns 103 ------- 104 new_group_names : list 105 Formatted version of the input group names. 106 107 Examples 108 -------- 109 >>> groups = ['HALLMARK_E2F_TARGETS', 'HALLMARK_HYPOXIA'] 110 >>> new_groups = scmkl.format_group_names(groups) 111 >>> new_groups 112 ['Hallmark E2F Targets', 'Hallmark Hypoxia'] 113 """ 114 new_group_names = list() 115 rm_words = [word.lower() for word in rm_words] 116 117 for name in group_names: 118 new_name = list() 119 for word in re.split(r'_|\s', name): 120 if word.isalpha() and (len(word) > 3): 121 word = word.capitalize() 122 if word.lower() not in rm_words: 123 new_name.append(word) 124 new_name = ' '.join(new_name) 125 new_group_names.append(new_name) 126 127 return new_group_names 128 129 130def parse_metrics(results: dict, key: str | None=None, 131 include_as: bool=False) -> pd.DataFrame: 132 """ 133 This function returns a pd.DataFrame for a single scMKL result 134 with performance results. 135 136 Parameters 137 ---------- 138 results : dict 139 A result dictionary from `scmkl.run()`. 140 141 key : str 142 If specified, will add a key column to the output dataframe 143 where each element is `key`. 144 145 include_as : bool 146 If `True`, will add a column indicating which models' used 147 the optimal alphas. 148 149 Returns 150 ------- 151 df : pd.DataFrame 152 A dataframe with columns `['Alpha', 'Metric', 'Value']`. 153 `'Key'` col only added if `key` is not `None`. 
154 """ 155 df = { 156 'Alpha' : list(), 157 'Metric' : list(), 158 'Value' : list() 159 } 160 161 # Check if is a multiclass result 162 is_mult, _ = _parse_result_type(results) 163 164 if is_mult: 165 df['Class'] = list() 166 167 # Ensuring results is a scMKL result and checking multiclass 168 if 'Metrics' in results.keys(): 169 for alpha in results['Metrics'].keys(): 170 for metric, value in results['Metrics'][alpha].items(): 171 df['Alpha'].append(alpha) 172 df['Metric'].append(metric) 173 df['Value'].append(value) 174 175 elif 'Classes' in results.keys(): 176 for ct in results['Classes']: 177 for alpha in results[ct]['Metrics'].keys(): 178 for metric, value in results[ct]['Metrics'][alpha].items(): 179 df['Alpha'].append(alpha) 180 df['Metric'].append(metric) 181 df['Value'].append(value) 182 df['Class'].append(ct) 183 184 else: 185 print(f"{key} is not a scMKL result and will be ignored.") 186 187 df = pd.DataFrame(df) 188 189 if include_as: 190 assert 'Alpha_star' in results.keys(), "'Alpha_star' not in results" 191 df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 192 193 if key is not None: 194 df['Key'] = [key] * df.shape[0] 195 196 return df 197 198 199def parse_weights(results: dict, include_as: bool=False, 200 key: None | str=None) -> pd.DataFrame: 201 """ 202 This function returns a pd.DataFrame for a single scMKL result 203 with group weights. 204 205 Parameters 206 ---------- 207 results : dict 208 A result dictionary from `scmkl.run()`. 209 210 key : str 211 If specified, will add a key column to the output dataframe 212 where each element is `key`. 213 214 include_as : bool 215 If `True`, will add a column indicating which models' used 216 the optimal alphas. 217 218 Returns 219 ------- 220 df : pd.DataFrame 221 A dataframe with columns `['Alpha', 'Group', 222 'Kernel Weight']`. `'Key'` col only added if `key` is not 223 `None`. 
224 """ 225 df = { 226 'Alpha' : list(), 227 'Group' : list(), 228 'Kernel Weight' : list() 229 } 230 231 # Check if is a multiclass result 232 is_mult, _ = _parse_result_type(results) 233 234 if is_mult: 235 df['Class'] = list() 236 237 # Ensuring results is a scMKL result and checking multiclass 238 if 'Norms' in results.keys(): 239 for alpha in results['Norms'].keys(): 240 df['Alpha'].extend([alpha]*len(results['Norms'][alpha])) 241 df['Group'].extend(results['Group_names']) 242 df['Kernel Weight'].extend(results['Norms'][alpha]) 243 244 elif 'Classes' in results.keys(): 245 for ct in results['Classes']: 246 for alpha in results[ct]['Norms'].keys(): 247 df['Alpha'].extend([alpha] * len(results[ct]['Norms'][alpha])) 248 df['Group'].extend(results[ct]['Group_names']) 249 df['Kernel Weight'].extend(results[ct]['Norms'][alpha]) 250 df['Class'].extend([ct]*len(results[ct]['Norms'][alpha])) 251 252 df = pd.DataFrame(df) 253 254 if include_as: 255 df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 256 257 if key is not None: 258 df['Key'] = [key] * df.shape[0] 259 260 return df 261 262 263def extract_results(results: dict, metric: str): 264 """ 265 266 """ 267 summary = {'Alpha' : list(), 268 metric : list(), 269 'Number of Selected Groups' : list(), 270 'Top Group' : list()} 271 272 alpha_list = list(results['Metrics'].keys()) 273 274 # Creating summary DataFrame for each model 275 for alpha in alpha_list: 276 cur_alpha_rows = results['Norms'][alpha] 277 top_weight_rows = np.max(results['Norms'][alpha]) 278 top_group_index = np.where(cur_alpha_rows == top_weight_rows) 279 num_selected = len(results['Selected_groups'][alpha]) 280 top_group_name = np.array(results['Group_names'])[top_group_index] 281 282 if 0 == num_selected: 283 top_group_name = ["No groups selected"] 284 285 summary['Alpha'].append(alpha) 286 summary[metric].append(results['Metrics'][alpha][metric]) 287 summary['Number of Selected Groups'].append(num_selected) 288 summary['Top 
Group'].append(*top_group_name) 289 290 return pd.DataFrame(summary) 291 292 293def get_summary(results: dict, metric: str='AUROC'): 294 """ 295 Takes the results from `scmkl.run()` and generates a dataframe 296 for each model containing columns for alpha, area under the ROC, 297 number of groups with nonzero weights, and highest weighted 298 group. 299 300 Parameters 301 ---------- 302 results : dict 303 A dictionary of results from scMKL generated from 304 `scmkl.run()`. 305 306 metric : str 307 Which metric to include in the summary. Default is AUROC. 308 Options include `'AUROC'`, `'Recall'`, `'Precision'`, 309 `'Accuracy'`, and `'F1-Score'`. 310 311 Returns 312 ------- 313 summary_df : pd.DataFrame 314 A table with columns: `['Alpha', 'AUROC', 315 'Number of Selected Groups', 'Top Group']`. 316 317 Examples 318 -------- 319 >>> results = scmkl.run(adata, alpha_list) 320 >>> summary_df = scmkl.get_summary(results) 321 ... 322 >>> summary_df.head() 323 Alpha AUROC Number of Selected Groups 324 0 2.20 0.8600 3 325 1 1.96 0.9123 4 326 2 1.72 0.9357 5 327 3 1.48 0.9524 7 328 4 1.24 0.9666 9 329 Top Group 330 0 RNA-HALLMARK_E2F_TARGETS 331 1 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 332 2 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 333 3 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 334 4 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 335 """ 336 is_multi, is_many = _parse_result_type(results) 337 assert not is_many, "This function only supports single results" 338 339 if is_multi: 340 summaries = list() 341 for ct in results['Classes']: 342 data = extract_results(results[ct], metric) 343 data['Class'] = [ct]*len(data) 344 summaries.append(data.copy()) 345 summary = pd.concat(summaries) 346 347 else: 348 summary = extract_results(results, metric) 349 350 return summary 351 352 353def read_files(dir: str, pattern: str | None=None) -> dict: 354 """ 355 This function takes a directory of scMKL results as pickle files 356 and returns a dictionary with the file names as keys and the data 357 from the 
respective files as the values. 358 359 Parameters 360 ---------- 361 dir : str 362 A string specifying the file path for the output scMKL runs. 363 364 pattern : str 365 A regex string for filtering down to desired files. If 366 `None`, all files in the directory with the pickle file 367 extension will be added to the dictionary. 368 369 Returns 370 ------- 371 results : dict 372 A dictionary with the file names as keys and data as values. 373 374 Examples 375 -------- 376 >>> filepath = 'scMKL_results/rna+atac/' 377 ... 378 >>> all_results = scmkl.read_files(filepath) 379 >>> all_results.keys() 380 dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...]) 381 """ 382 # Reading all pickle files in patter is None 383 if pattern is None: 384 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 385 for file in os.listdir(dir) if '.pkl' in file} 386 387 # Reading only files matching pattern if not None 388 else: 389 pattern = repr(pattern) 390 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 391 for file in os.listdir(dir) 392 if re.fullmatch(pattern, file) is not None} 393 394 return data 395 396 397def get_metrics(results: dict, include_as: bool=False) -> pd.DataFrame: 398 """ 399 Takes either a single scMKL result or a dictionary where each 400 entry cooresponds to one result. Returns a dataframe with cols 401 ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 402 col of booleans will be added to indicate whether or not the run 403 respective to that alpha was chosen as optimal via CV. If 404 `include_key == True`, another column will be added with the name 405 of the key to the respective file (only applicable with multiple 406 results). 407 408 Parameters 409 ---------- 410 results : dict | None 411 A dictionary with the results of a single run from 412 `scmkl.run()`. Must be `None` if `rfiles is not None`. 413 414 rfiles : dict | None 415 A dictionary of results dictionaries containing multiple 416 results from `scmkl.run()`. 
417 418 include_as : bool 419 When `True`, will add a bool col to output pd.DataFrame 420 where rows with alphas cooresponding to alpha_star will be 421 `True`. 422 423 Returns 424 ------- 425 df : pd.DataFrame 426 A pd.DataFrame containing all of the metrics present from 427 the runs input. 428 429 Examples 430 -------- 431 >>> # For a single file 432 >>> results = scmkl.run(adata) 433 >>> metrics = scmkl.get_metrics(results = results) 434 435 >>> # For multiple runs saved in a dict 436 >>> output_dir = 'scMKL_outputs/' 437 >>> rfiles = scmkl.read_files(output_dir) 438 >>> metrics = scmkl.get_metrics(rfiles=rfiles) 439 """ 440 # Checking which data is being worked with 441 is_mult, is_many = _parse_result_type(results) 442 443 # Initiating col list with minimal columns 444 cols = ['Alpha', 'Metric', 'Value'] 445 446 if include_as: 447 cols.append('Alpha Star') 448 if is_mult: 449 cols.append('Class') 450 451 if is_many: 452 cols.append('Key') 453 df = pd.DataFrame(columns = cols) 454 for key, result in results.items(): 455 cur_df = parse_metrics(results = result, key = key, 456 include_as = include_as) 457 df = pd.concat([df, cur_df.copy()]) 458 459 else: 460 df = parse_metrics(results = results, include_as = include_as) 461 462 return df 463 464 465def get_weights(results : dict, include_as : bool = False) -> pd.DataFrame: 466 """ 467 Takes either a single scMKL result or dictionary of results and 468 returns a pd.DataFrame with cols ['Alpha', 'Group', 469 'Kernel Weight']. If `include_as == True`, a fourth col will be 470 added to indicate whether or not the run respective to that alpha 471 was chosen as optimal via cross validation. 472 473 Parameters 474 ---------- 475 results : dict | None 476 A dictionary with the results of a single run from 477 `scmkl.run()`. Must be `None` if `rfiles is not None`. 478 479 rfiles : dict | None 480 A dictionary of results dictionaries containing multiple 481 results from `scmkl.run()`. 
482 483 include_as : bool 484 When `True`, will add a bool col to output pd.DataFrame 485 where rows with alphas cooresponding to alpha_star will be 486 `True`. 487 488 Returns 489 ------- 490 df : pd.DataFrame 491 A pd.DataFrame containing all of the groups from each alpha 492 and their cooresponding kernel weights. 493 494 Examples 495 -------- 496 >>> # For a single file 497 >>> results = scmkl.run(adata) 498 >>> weights = scmkl.get_weights(results = results) 499 500 >>> # For multiple runs saved in a dict 501 >>> output_dir = 'scMKL_outputs/' 502 >>> rfiles = scmkl.read_files(output_dir) 503 >>> weights = scmkl.get_weights(rfiles=rfiles) 504 """ 505 # Checking which data is being worked with 506 is_mult, is_many = _parse_result_type(results) 507 508 # Initiating col list with minimal columns 509 cols = ['Alpha', 'Group', 'Kernel Weight'] 510 511 if include_as: 512 cols.append('Alpha Star') 513 if is_mult: 514 cols.append('Class') 515 516 if is_many: 517 cols.append('Key') 518 df = pd.DataFrame(columns = cols) 519 for key, result in results.items(): 520 cur_df = parse_weights(results = result, key = key, 521 include_as = include_as) 522 df = pd.concat([df, cur_df.copy()]) 523 524 else: 525 df = parse_weights(results = results, include_as = include_as) 526 527 return df 528 529 530def get_selection(weights_df: pd.DataFrame, 531 order_groups: bool=False) -> pd.DataFrame: 532 """ 533 This function takes a pd.DataFrame created by 534 `scmkl.get_weights()` and returns a selection table. Selection 535 refers to how many times a group had a nonzero group weight. To 536 calculate this, a col is added indicating whether the group was 537 selected. Then, the dataframe is grouped by alpha and group. 538 Selection can then be summed returning a dataframe with cols 539 `['Alpha', 'Group', Selection]`. If is the result of multiclass 540 run(s), `'Class'` column must be present and will be in resulting 541 df as well. 
542 543 Parameters 544 ---------- 545 weights_df : pd.DataFrame 546 A dataframe output by `scmkl.get_weights()` with cols 547 `['Alpha', 'Group', 'Kernel Weight']`. If is the result of 548 multiclass run(s), `'Class'` column must be present as well. 549 550 order_groups : bool 551 If `True`, the `'Group'` col of the output dataframe will be 552 made into a `pd.Categorical` col ordered by number of times 553 each group was selected in decending order. 554 555 Returns 556 ------- 557 df : pd.DataFrame 558 A dataframe with cols `['Alpha', 'Group', Selection]`. Also, 559 `'Class'` column if is a multiclass result. 560 561 Example 562 ------- 563 >>> # For a single file 564 >>> results = scmkl.run(adata) 565 >>> weights = scmkl.get_weights(results = results) 566 >>> selection = scmkl.get_selection(weights) 567 568 >>> # For multiple runs saved in a dict 569 >>> output_dir = 'scMKL_outputs/' 570 >>> rfiles = scmkl.read_files(output_dir) 571 >>> weights = scmkl.get_weights(rfiles=rfiles) 572 >>> selection = scmkl.get_selection(weights) 573 """ 574 # Adding col indicating whether or not groups have nonzero weight 575 selection = weights_df['Kernel Weight'].apply(lambda x: x > 0) 576 weights_df['Selection'] = selection 577 578 # Summing selection across replications to get selection 579 is_mult = 'Class' in weights_df.columns 580 if is_mult: 581 df = weights_df.groupby(['Alpha', 'Group', 'Class'])['Selection'].sum() 582 else: 583 df = weights_df.groupby(['Alpha', 'Group'])['Selection'].sum() 584 df = df.reset_index() 585 586 # Getting group order 587 if order_groups and not is_mult: 588 order = df.groupby('Group')['Selection'].sum() 589 order = order.reset_index().sort_values(by = 'Selection', 590 ascending = False) 591 order = order['Group'] 592 df['Group'] = pd.Categorical(df['Group'], categories = order) 593 594 595 return df 596 597 598def groups_per_alpha(selection_df: pd.DataFrame) -> dict: 599 """ 600 This function takes a pd.DataFrame from `scmkl.get_selection()` 
601 generated from multiple scMKL results and returns a dictionary 602 with keys being alphas from the input dataframe and values being 603 the mean number of selected groups for a given alpha across 604 results. 605 606 Parameters 607 ---------- 608 selection_df : pd.DataFrame 609 A dataframe output by `scmkl.get_selection()` with cols 610 `['Alpha', 'Group', Selection]. 611 612 Returns 613 ------- 614 mean_groups : dict 615 A dictionary with alphas as keys and the mean number of 616 selected groups for that alpha as keys. 617 618 Examples 619 -------- 620 >>> weights = scmkl.get_weights(rfiles) 621 >>> selection = scmkl.get_selection(weights) 622 >>> mean_groups = scmkl.mean_groups_per_alpha(selection) 623 >>> mean_groups = {alpha : np.round(num_selected, 1) 624 ... for alpha, num_selected in mean_groups.items()} 625 >>> 626 >>> print(mean_groups) 627 {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3} 628 """ 629 mean_groups = {} 630 for alpha in np.unique(selection_df['Alpha']): 631 632 # Capturing rows for given alpha 633 rows = selection_df['Alpha'] == alpha 634 635 # Adding mean number of groups for alpha 636 mean_groups[alpha] = np.mean(selection_df[rows]['Selection']) 637 638 return mean_groups 639 640 641def read_gtf(path: str, filter_to_coding: bool=False): 642 """ 643 Reads and formats a gtf file. Adds colnames: `['chr', 'source', 644 'feature', 'start', 'end', 'score', 'strand', 'frame', 645 'attribute']`. 646 647 Parameters 648 ---------- 649 path : str 650 The file path to the gtf file to be read in. If the file is 651 gzipped, file name must end with .gz. 652 653 filter_to_coding : bool 654 If `True`, will filter rows in gtf data frame to only 655 protein coding genes. Will add column `'gene_name'` containing 656 the gene name for each row. 657 658 Returns 659 ------- 660 df : pd.DataFrame 661 A pandas dataframe of the input gtf file. 
662 663 Examples 664 -------- 665 >>> import scmkl 666 >>> 667 >>> file = 'data/hg38_subset_protein_coding.annotation.gtf' 668 >>> gtf = scmkl.read_gtf(file) 669 >>> 670 >>> gtf.head() 671 chr source feature start end score strand frame 672 0 chr1 HAVANA gene 11869 14409 . + . 673 1 chr1 HAVANA transcript 11869 14409 . + . 674 2 chr1 HAVANA exon 11869 12227 . + . 675 3 chr1 HAVANA exon 12613 12721 . + . 676 4 chr1 HAVANA exon 13221 14409 . + . 677 attribute 678 gene_id "ENSG00000223972.5"; gene_type "transc... 679 gene_id "ENSG00000223972.5"; transcript_id "EN... 680 gene_id "ENSG00000223972.5"; transcript_id "EN... 681 gene_id "ENSG00000223972.5"; transcript_id "EN... 682 gene_id "ENSG00000223972.5"; transcript_id "EN... 683 """ 684 df = pd.read_csv(path, sep='\t', comment='#', 685 skip_blank_lines=True, header=None) 686 687 df.columns = ['chr', 'source', 'feature', 'start', 'end', 688 'score', 'strand', 'frame', 'attribute'] 689 690 if filter_to_coding: 691 prot_rows = df['attribute'].str.contains('protein_coding') 692 df = df[prot_rows] 693 df = df[df['feature'] == 'gene'] 694 695 # Capturing and adding gene name to df 696 df['gene_name'] = [re.findall(r'(?<=gene_name ")[A-z0-9]+', 697 attr)[0] 698 for attr in df['attribute']] 699 700 return df
def sort_groups(df: pd.DataFrame, group_col: str='Group',
                norm_col: str='Kernel Weight'):
    """
    Returns the groups of `df` ranked from highest to lowest weight.

    Each group is expected to appear exactly once in `df`.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe holding both `group_col` and `norm_col`.

    group_col : str
        Name of the column with the group names.

    norm_col : str
        Name of the column with the kernel weights used for ranking.

    Returns
    -------
    group_order : list
        The values of `group_col`, ordered by descending `norm_col`.

    Examples
    --------
    >>> result = scmkl.run(adata, alpha_list)
    >>> weights = scmkl.get_weights(result)
    >>> group_order = scmkl.sort_groups(weights, 'Group',
    ...                                 'Kernel Weight')
    >>>
    >>> group_order
    ['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLM...', ...]
    """
    # sort_values hands back a new frame; the caller's df is untouched
    ranked = df.sort_values(by=norm_col, ascending=False)

    return list(ranked[group_col])
Takes a dataframe with group_col
and returns sorted group list
with groups in descending order by their weights. Assumes there is
one instance of each group.
Parameters
- df (pd.DataFrame):
A dataframe with `group_col` and `norm_col` to be sorted by.
- group_col (str): The column containing the group names.
- norm_col (str): The column containing the kernel weights.
Returns
- group_order (list): A list of groups in descending order according to their kernel weights.
Examples
>>> result = scmkl.run(adata, alpha_list)
>>> weights = scmkl.get_weights(result)
>>> group_order = scmkl.sort_groups(weights, 'Group',
... 'Kernel Weight')
>>>
>>> group_order
['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLM...', ...]
90def format_group_names(group_names: list | pd.Series | np.ndarray, 91 rm_words: list=list()): 92 """ 93 Takes an ArrayLike object of group names and formats them. 94 95 Parameters 96 ---------- 97 group_names : array_like 98 An array of group names to format. 99 100 rm_words : list 101 Words to remove from all group names. 102 103 Returns 104 ------- 105 new_group_names : list 106 Formatted version of the input group names. 107 108 Examples 109 -------- 110 >>> groups = ['HALLMARK_E2F_TARGETS', 'HALLMARK_HYPOXIA'] 111 >>> new_groups = scmkl.format_group_names(groups) 112 >>> new_groups 113 ['Hallmark E2F Targets', 'Hallmark Hypoxia'] 114 """ 115 new_group_names = list() 116 rm_words = [word.lower() for word in rm_words] 117 118 for name in group_names: 119 new_name = list() 120 for word in re.split(r'_|\s', name): 121 if word.isalpha() and (len(word) > 3): 122 word = word.capitalize() 123 if word.lower() not in rm_words: 124 new_name.append(word) 125 new_name = ' '.join(new_name) 126 new_group_names.append(new_name) 127 128 return new_group_names
Takes an ArrayLike object of group names and formats them.
Parameters
- group_names (array_like): An array of group names to format.
- rm_words (list): Words to remove from all group names.
Returns
- new_group_names (list): Formatted version of the input group names.
Examples
>>> groups = ['HALLMARK_E2F_TARGETS', 'HALLMARK_HYPOXIA']
>>> new_groups = scmkl.format_group_names(groups)
>>> new_groups
['Hallmark E2F Targets', 'Hallmark Hypoxia']
def parse_metrics(results: dict, key: str | None=None,
                  include_as: bool=False) -> pd.DataFrame:
    """
    Flattens the performance metrics of one scMKL result into a
    long-format pd.DataFrame.

    Parameters
    ----------
    results : dict
        A result dictionary from `scmkl.run()`.

    key : str
        When given, a `'Key'` column filled with `key` is appended.

    include_as : bool
        When `True`, an `'Alpha Star'` column marks the rows whose
        alpha equals `results['Alpha_star']`.

    Returns
    -------
    df : pd.DataFrame
        One row per alpha/metric pair with columns `['Alpha',
        'Metric', 'Value']`, plus `'Class'` for multiclass results,
        `'Alpha Star'` if requested and `'Key'` if `key` is not
        `None`.
    """
    # Multiclass results carry one metrics table per class
    is_mult, _ = _parse_result_type(results)

    alphas, names, values, classes = [], [], [], []

    if 'Metrics' in results.keys():
        for alpha, scores in results['Metrics'].items():
            for name, score in scores.items():
                alphas.append(alpha)
                names.append(name)
                values.append(score)

    elif 'Classes' in results.keys():
        for ct in results['Classes']:
            for alpha, scores in results[ct]['Metrics'].items():
                for name, score in scores.items():
                    alphas.append(alpha)
                    names.append(name)
                    values.append(score)
                    classes.append(ct)

    else:
        print(f"{key} is not a scMKL result and will be ignored.")

    table = {'Alpha' : alphas, 'Metric' : names, 'Value' : values}
    if is_mult:
        table['Class'] = classes
    df = pd.DataFrame(table)

    if include_as:
        assert 'Alpha_star' in results.keys(), "'Alpha_star' not in results"
        df['Alpha Star'] = df['Alpha'] == results['Alpha_star']

    if key is not None:
        df['Key'] = [key] * len(df)

    return df
This function returns a pd.DataFrame for a single scMKL result with performance results.
Parameters
- results (dict): A result dictionary from `scmkl.run()`.
- key (str): If specified, will add a key column to the output dataframe where each element is `key`.
- include_as (bool): If `True`, will add a column indicating which models used the optimal alphas.
Returns
- df (pd.DataFrame): A dataframe with columns `['Alpha', 'Metric', 'Value']`. The `'Key'` column is only added if `key` is not `None`.
def parse_weights(results: dict, include_as: bool=False,
                  key: None | str=None) -> pd.DataFrame:
    """
    Flattens the group weights of one scMKL result into a
    long-format pd.DataFrame.

    Parameters
    ----------
    results : dict
        A result dictionary from `scmkl.run()`.

    key : str
        When given, a `'Key'` column filled with `key` is appended.

    include_as : bool
        When `True`, an `'Alpha Star'` column marks the rows whose
        alpha equals `results['Alpha_star']`.

    Returns
    -------
    df : pd.DataFrame
        One row per alpha/group pair with columns `['Alpha', 'Group',
        'Kernel Weight']`, plus `'Class'` for multiclass results,
        `'Alpha Star'` if requested and `'Key'` if `key` is not
        `None`.
    """
    # Multiclass results carry one set of norms per class
    is_mult, _ = _parse_result_type(results)

    alphas, groups, weights, classes = [], [], [], []

    if 'Norms' in results.keys():
        for alpha, norms in results['Norms'].items():
            alphas.extend([alpha] * len(norms))
            groups.extend(results['Group_names'])
            weights.extend(norms)

    elif 'Classes' in results.keys():
        for ct in results['Classes']:
            class_result = results[ct]
            for alpha, norms in class_result['Norms'].items():
                n_groups = len(norms)
                alphas.extend([alpha] * n_groups)
                groups.extend(class_result['Group_names'])
                weights.extend(norms)
                classes.extend([ct] * n_groups)

    table = {'Alpha' : alphas, 'Group' : groups, 'Kernel Weight' : weights}
    if is_mult:
        table['Class'] = classes
    df = pd.DataFrame(table)

    if include_as:
        df['Alpha Star'] = df['Alpha'] == results['Alpha_star']

    if key is not None:
        df['Key'] = [key] * len(df)

    return df
This function returns a pd.DataFrame for a single scMKL result with group weights.
Parameters
- results (dict): A result dictionary from `scmkl.run()`.
- key (str): If specified, will add a key column to the output dataframe where each element is `key`.
- include_as (bool): If `True`, will add a column indicating which models used the optimal alphas.
Returns
- df (pd.DataFrame): A dataframe with columns `['Alpha', 'Group', 'Kernel Weight']`. The `'Key'` column is only added if `key` is not `None`.
def extract_results(results: dict, metric: str):
    """
    Summarizes each model (one per alpha) of a single, non-multiclass
    scMKL result.

    Parameters
    ----------
    results : dict
        A result dictionary with the keys `'Metrics'`, `'Norms'`,
        `'Selected_groups'` and `'Group_names'`.

    metric : str
        The metric to report; must be a key of
        `results['Metrics'][alpha]`.

    Returns
    -------
    summary : pd.DataFrame
        A dataframe with columns `['Alpha', metric,
        'Number of Selected Groups', 'Top Group']`. `'Top Group'` is
        the first group with the largest weight, or
        `'No groups selected'` when no group was selected.
    """
    summary = {'Alpha' : list(),
               metric : list(),
               'Number of Selected Groups' : list(),
               'Top Group' : list()}

    group_names = np.asarray(results['Group_names'])

    # Creating summary row for each model
    for alpha in results['Metrics']:
        num_selected = len(results['Selected_groups'][alpha])

        if num_selected == 0:
            top_group_name = "No groups selected"
        else:
            # argmax returns exactly one index. The previous
            # `append(*names)` raised TypeError whenever several
            # groups tied for the top weight, or when the norms were
            # a plain list (the comparison then matched nothing).
            weights = np.asarray(results['Norms'][alpha])
            top_group_name = group_names[int(np.argmax(weights))]

        summary['Alpha'].append(alpha)
        summary[metric].append(results['Metrics'][alpha][metric])
        summary['Number of Selected Groups'].append(num_selected)
        summary['Top Group'].append(top_group_name)

    return pd.DataFrame(summary)
def get_summary(results: dict, metric: str='AUROC'):
    """
    Builds a per-model summary table from the output of
    `scmkl.run()`: alpha, the requested metric, the number of groups
    with nonzero weights, and the highest weighted group.

    Parameters
    ----------
    results : dict
        A dictionary of results from scMKL generated from
        `scmkl.run()`. Dictionaries holding several results are not
        supported.

    metric : str
        Which metric to include in the summary. Default is AUROC.
        Options include `'AUROC'`, `'Recall'`, `'Precision'`,
        `'Accuracy'`, and `'F1-Score'`.

    Returns
    -------
    summary_df : pd.DataFrame
        A table with columns: `['Alpha', metric,
        'Number of Selected Groups', 'Top Group']`, plus a `'Class'`
        column for multiclass results.

    Examples
    --------
    >>> results = scmkl.run(adata, alpha_list)
    >>> summary_df = scmkl.get_summary(results)
    ...
    >>> summary_df.head()
        Alpha   AUROC  Number of Selected Groups
    0   2.20  0.8600                          3
    1   1.96  0.9123                          4
    2   1.72  0.9357                          5
    3   1.48  0.9524                          7
    4   1.24  0.9666                          9
                                  Top Group
    0              RNA-HALLMARK_E2F_TARGETS
    1  RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    2  RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    3  RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    4  RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    """
    is_multi, is_many = _parse_result_type(results)
    assert not is_many, "This function only supports single results"

    # Binary result: a single table is all there is
    if not is_multi:
        return extract_results(results, metric)

    # Multiclass result: one table per class, labelled and stacked
    per_class = []
    for ct in results['Classes']:
        table = extract_results(results[ct], metric)
        table['Class'] = [ct] * len(table)
        per_class.append(table.copy())

    return pd.concat(per_class)
Takes the results from scmkl.run
and generates a dataframe
for each model containing columns for alpha, area under the ROC,
number of groups with nonzero weights, and highest weighted
group.
Parameters
- results (dict): A dictionary of results from scMKL generated from `scmkl.run()`.
- metric (str): Which metric to include in the summary. Default is AUROC. Options include `'AUROC'`, `'Recall'`, `'Precision'`, `'Accuracy'`, and `'F1-Score'`.
Returns
- summary_df (pd.DataFrame): A table with columns: `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`.
Examples
>>> results = scmkl.run(adata, alpha_list)
>>> summary_df = scmkl.get_summary(results)
...
>>> summary_df.head()
Alpha AUROC Number of Selected Groups
0 2.20 0.8600 3
1 1.96 0.9123 4
2 1.72 0.9357 5
3 1.48 0.9524 7
4 1.24 0.9666 9
Top Group
0 RNA-HALLMARK_E2F_TARGETS
1 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
2 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
3 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
4 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
354def read_files(dir: str, pattern: str | None=None) -> dict: 355 """ 356 This function takes a directory of scMKL results as pickle files 357 and returns a dictionary with the file names as keys and the data 358 from the respective files as the values. 359 360 Parameters 361 ---------- 362 dir : str 363 A string specifying the file path for the output scMKL runs. 364 365 pattern : str 366 A regex string for filtering down to desired files. If 367 `None`, all files in the directory with the pickle file 368 extension will be added to the dictionary. 369 370 Returns 371 ------- 372 results : dict 373 A dictionary with the file names as keys and data as values. 374 375 Examples 376 -------- 377 >>> filepath = 'scMKL_results/rna+atac/' 378 ... 379 >>> all_results = scmkl.read_files(filepath) 380 >>> all_results.keys() 381 dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...]) 382 """ 383 # Reading all pickle files in patter is None 384 if pattern is None: 385 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 386 for file in os.listdir(dir) if '.pkl' in file} 387 388 # Reading only files matching pattern if not None 389 else: 390 pattern = repr(pattern) 391 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 392 for file in os.listdir(dir) 393 if re.fullmatch(pattern, file) is not None} 394 395 return data
This function takes a directory of scMKL results as pickle files and returns a dictionary with the file names as keys and the data from the respective files as the values.
Parameters
- dir (str): A string specifying the file path for the output scMKL runs.
- pattern (str): A regex string for filtering down to desired files. If None, all files in the directory with the pickle file extension will be added to the dictionary.
Returns
- results (dict): A dictionary with the file names as keys and data as values.
Examples
>>> filepath = 'scMKL_results/rna+atac/'
...
>>> all_results = scmkl.read_files(filepath)
>>> all_results.keys()
dict_keys(['Rep_1.pkl', 'Rep_2.pkl', 'Rep_3.pkl', ...])
def get_metrics(results: dict, include_as: bool=False) -> pd.DataFrame:
    """
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When 
    `results` holds multiple results, a `'Key'` column is added with 
    the dictionary key of the respective result.

    Parameters
    ----------
    results : dict
        Either the results of a single run from `scmkl.run()` or a 
        dictionary of such results (for example, the output of 
        `scmkl.read_files()`).

    include_as : bool
        When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    df : pd.DataFrame
        A pd.DataFrame containing all of the metrics present from 
        the runs input.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results = results)
    
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(results = rfiles)
    """
    # Checking which data is being worked with
    is_mult, is_many = _parse_result_type(results)

    if not is_many:
        return parse_metrics(results = results, include_as = include_as)

    # Column layout expected for a table built from multiple results
    cols = ['Alpha', 'Metric', 'Value']
    if include_as:
        cols.append('Alpha Star')
    if is_mult:
        cols.append('Class')
    cols.append('Key')

    # Parsing every result first and concatenating once avoids 
    # re-copying the growing table on each iteration and avoids 
    # concatenating onto an empty frame, which pandas deprecates and 
    # which can degrade column dtypes to object.
    frames = [parse_metrics(results = result, key = key, 
                            include_as = include_as)
              for key, result in results.items()]
    df = pd.concat(frames)

    # Expected columns first, then any extras in order of appearance
    extra = [col for col in df.columns if col not in cols]

    return df.reindex(columns = cols + extra)
Takes either a single scMKL result or a dictionary where each entry corresponds to one result. Returns a dataframe with cols ['Alpha', 'Metric', 'Value']. If include_as == True, another col of booleans will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV. When multiple results are given, another column ('Key') will be added with the name of the key to the respective file (only applicable with multiple results).
Parameters
- results (dict): Either the results of a single run from `scmkl.run()` or a dictionary of such results (for example, the output of `scmkl.read_files()`).
- rfiles: Not an argument of the current signature, `get_metrics(results, include_as=False)`; a dictionary of results dictionaries containing multiple results from `scmkl.run()` is passed as `results` instead.
- include_as (bool): When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.
Returns
- df (pd.DataFrame): A pd.DataFrame containing all of the metrics present from the runs input.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> metrics = scmkl.get_metrics(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> metrics = scmkl.get_metrics(results = rfiles)
def get_weights(results : dict, include_as : bool = False) -> pd.DataFrame:
    """
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If `include_as == True`, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via cross validation. When `results` holds 
    multiple results, a `'Key'` column is added with the dictionary 
    key of the respective result.

    Parameters
    ----------
    results : dict
        Either the results of a single run from `scmkl.run()` or a 
        dictionary of such results (for example, the output of 
        `scmkl.read_files()`).

    include_as : bool
        When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    df : pd.DataFrame
        A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(results = rfiles)
    """
    # Checking which data is being worked with
    is_mult, is_many = _parse_result_type(results)

    if not is_many:
        return parse_weights(results = results, include_as = include_as)

    # Column layout expected for a table built from multiple results
    cols = ['Alpha', 'Group', 'Kernel Weight']
    if include_as:
        cols.append('Alpha Star')
    if is_mult:
        cols.append('Class')
    cols.append('Key')

    # Parsing every result first and concatenating once avoids 
    # re-copying the growing table on each iteration and avoids 
    # concatenating onto an empty frame, which pandas deprecates and 
    # which can degrade column dtypes to object.
    frames = [parse_weights(results = result, key = key, 
                            include_as = include_as)
              for key, result in results.items()]
    df = pd.concat(frames)

    # Expected columns first, then any extras in order of appearance
    extra = [col for col in df.columns if col not in cols]

    return df.reindex(columns = cols + extra)
Takes either a single scMKL result or dictionary of results and returns a pd.DataFrame with cols ['Alpha', 'Group', 'Kernel Weight']. If include_as == True, a fourth col will be added to indicate whether or not the run respective to that alpha was chosen as optimal via cross validation.
Parameters
- results (dict): Either the results of a single run from `scmkl.run()` or a dictionary of such results (for example, the output of `scmkl.read_files()`).
- rfiles: Not an argument of the current signature, `get_weights(results, include_as=False)`; a dictionary of results dictionaries containing multiple results from `scmkl.run()` is passed as `results` instead.
- include_as (bool): When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.
Returns
- df (pd.DataFrame): A pd.DataFrame containing all of the groups from each alpha and their corresponding kernel weights.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(results = rfiles)
def get_selection(weights_df: pd.DataFrame, 
                  order_groups: bool=False) -> pd.DataFrame:
    """
    This function takes a pd.DataFrame created by 
    `scmkl.get_weights()` and returns a selection table. Selection 
    refers to how many times a group had a nonzero group weight. To 
    calculate this, each row is flagged as selected when its kernel 
    weight is greater than zero. Then, the dataframe is grouped by 
    alpha and group and the flags are summed, returning a dataframe 
    with cols `['Alpha', 'Group', 'Selection']`. If the input is the 
    result of multiclass run(s), the `'Class'` column is used as an 
    additional grouping key and will be in the resulting df as well.

    The input dataframe is not modified.

    Parameters
    ----------
    weights_df : pd.DataFrame
        A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`. If it is the result of 
        multiclass run(s), `'Class'` column must be present as well.

    order_groups : bool
        If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col ordered by number of times 
        each group was selected in descending order. Ignored when a 
        `'Class'` column is present.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with cols `['Alpha', 'Group', 'Selection']`. 
        Also, `'Class'` column if is a multiclass result.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> selection = scmkl.get_selection(weights)
    
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(results = rfiles)
    >>> selection = scmkl.get_selection(weights)
    """
    # Flagging groups with nonzero weight. assign() returns a new 
    # frame, so the caller's dataframe does not gain a column.
    flagged = weights_df.assign(
        Selection = weights_df['Kernel Weight'] > 0
        )

    # Summing selection across replications to get selection
    is_mult = 'Class' in flagged.columns
    by = ['Alpha', 'Group', 'Class'] if is_mult else ['Alpha', 'Group']
    df = flagged.groupby(by)['Selection'].sum().reset_index()

    # Getting group order
    if order_groups and not is_mult:
        totals = df.groupby('Group')['Selection'].sum()
        order = totals.sort_values(ascending = False).index.tolist()
        df['Group'] = pd.Categorical(df['Group'], categories = order)

    return df
This function takes a pd.DataFrame created by `scmkl.get_weights()` and returns a selection table. Selection refers to how many times a group had a nonzero group weight. To calculate this, a col is added indicating whether the group was selected. Then, the dataframe is grouped by alpha and group. Selection can then be summed, returning a dataframe with cols ['Alpha', 'Group', 'Selection']. If the input is the result of multiclass run(s), the 'Class' column must be present and will be in the resulting df as well.
Parameters
- weights_df (pd.DataFrame): A dataframe output by `scmkl.get_weights()` with cols ['Alpha', 'Group', 'Kernel Weight']. If it is the result of multiclass run(s), the 'Class' column must be present as well.
- order_groups (bool): If True, the 'Group' col of the output dataframe will be made into a pd.Categorical col ordered by number of times each group was selected in descending order.
Returns
- df (pd.DataFrame): A dataframe with cols ['Alpha', 'Group', 'Selection']. Also includes the 'Class' column if it is a multiclass result.
Example
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> selection = scmkl.get_selection(weights)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(results = rfiles)
>>> selection = scmkl.get_selection(weights)
def groups_per_alpha(selection_df: pd.DataFrame) -> dict:
    """
    This function takes a pd.DataFrame from `scmkl.get_selection()` 
    and returns a dictionary keyed by each unique alpha in the input 
    dataframe. Each value is the mean of the `'Selection'` column 
    over the rows belonging to that alpha.

    Parameters
    ----------
    selection_df : pd.DataFrame
        A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.

    Returns
    -------
    mean_groups : dict
        A dictionary with alphas as keys and the mean `'Selection'` 
        for that alpha as values.

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    """
    alphas = selection_df['Alpha']

    # One entry per distinct alpha, averaging that alpha's rows
    return {value : np.mean(selection_df.loc[alphas == value, 'Selection'])
            for value in np.unique(alphas)}
This function takes a pd.DataFrame from scmkl.get_selection()
generated from multiple scMKL results and returns a dictionary
with keys being alphas from the input dataframe and values being
the mean number of selected groups for a given alpha across
results.
Parameters
- selection_df (pd.DataFrame): A dataframe output by `scmkl.get_selection()` with cols ['Alpha', 'Group', 'Selection'].
Returns
- mean_groups (dict): A dictionary with alphas as keys and the mean number of selected groups for that alpha as values.
Examples
>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights)
>>> mean_groups = scmkl.groups_per_alpha(selection)
>>> mean_groups = {alpha : np.round(num_selected, 1)
... for alpha, num_selected in mean_groups.items()}
>>>
>>> print(mean_groups)
{0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
def read_gtf(path: str, filter_to_coding: bool=False):
    """
    Reads and formats a gtf file. Adds colnames: `['chr', 'source', 
    'feature', 'start', 'end', 'score', 'strand', 'frame', 
    'attribute']`.

    Parameters
    ----------
    path : str
        The file path to the gtf file to be read in. If the file is 
        gzipped, file name must end with .gz.

    filter_to_coding : bool
        If `True`, will filter rows in gtf data frame to only 
        `'gene'` features whose attribute field contains 
        `'protein_coding'`. Will add column `'gene_name'` containing 
        the full quoted `gene_name` value for each row (missing 
        value if the attribute has no `gene_name`).

    Returns
    -------
    df : pd.DataFrame
        A pandas dataframe of the input gtf file.

    Examples
    --------
    >>> import scmkl
    >>>
    >>> file = 'data/hg38_subset_protein_coding.annotation.gtf'
    >>> gtf = scmkl.read_gtf(file)
    >>>
    >>> gtf.head()
        chr  source     feature  start    end score strand frame
    0  chr1  HAVANA        gene  11869  14409     .      +     .
    1  chr1  HAVANA  transcript  11869  14409     .      +     .
    2  chr1  HAVANA        exon  11869  12227     .      +     .
    3  chr1  HAVANA        exon  12613  12721     .      +     .
    4  chr1  HAVANA        exon  13221  14409     .      +     .
                                                attribute
    gene_id "ENSG00000223972.5"; gene_type "transc...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    """
    df = pd.read_csv(path, sep='\t', comment='#', 
                     skip_blank_lines=True, header=None)
    
    df.columns = ['chr', 'source', 'feature', 'start', 'end', 
                  'score', 'strand', 'frame', 'attribute']

    if filter_to_coding:
        # na=False keeps rows with a missing attribute out of the 
        # mask rather than breaking the boolean indexing
        is_coding = df['attribute'].str.contains('protein_coding', 
                                                 regex=False, na=False)
        is_gene = df['feature'] == 'gene'

        # Explicit copy so the new column is not set on a slice
        df = df[is_coding & is_gene].copy()

        # Capturing everything between the quotes keeps names that 
        # contain '-' or '.' (e.g. HLA-A, MT-CO1) intact
        df['gene_name'] = df['attribute'].str.extract(
            r'gene_name "([^"]+)"', expand=False
            )
    
    return df
Reads and formats a gtf file. Adds colnames: ['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'].
Parameters
- path (str): The file path to the gtf file to be read in. If the file is gzipped, file name must end with .gz.
- filter_to_coding (bool): If True, will filter rows in gtf data frame to only protein coding genes. Will add column 'gene_name' containing the gene name for each row.
Returns
- df (pd.DataFrame): A pandas dataframe of the input gtf file.
Examples
>>> import scmkl
>>>
>>> file = 'data/hg38_subset_protein_coding.annotation.gtf'
>>> gtf = scmkl.read_gtf(file)
>>>
>>> gtf.head()
chr source feature start end score strand frame
0 chr1 HAVANA gene 11869 14409 . + .
1 chr1 HAVANA transcript 11869 14409 . + .
2 chr1 HAVANA exon 11869 12227 . + .
3 chr1 HAVANA exon 12613 12721 . + .
4 chr1 HAVANA exon 13221 14409 . + .
attribute
gene_id "ENSG00000223972.5"; gene_type "transc...
gene_id "ENSG00000223972.5"; transcript_id "EN...
gene_id "ENSG00000223972.5"; transcript_id "EN...
gene_id "ENSG00000223972.5"; transcript_id "EN...
gene_id "ENSG00000223972.5"; transcript_id "EN...