scmkl.dataframes

  1import os
  2import re
  3import numpy as np
  4import pandas as pd
  5
  6
  7def _parse_result_type(results: dict):
  8    """
  9    Returns whether or not there are multiple results per class.
 10
 11    Parameters
 12    ----------
 13    results : dict
 14        Either the output of `scmkl.run()` or `scmkl.one_v_rest()` or 
 15        a dictionary of those results.
 16
 17    Returns
 18    -------
 19    is_mult, is_many : bool, bool
 20        If `is_mult` is `True`, then results are multiclass. If 
 21        `is_many` is `True`, results contain multiple outputs.
 22
 23    """
 24    # Single result cases
 25    if 'Classes' in results.keys():
 26        is_mult = True
 27        is_many = False
 28        return is_mult, is_many
 29    elif 'Norms' in results.keys():
 30        is_mult = False
 31        is_many = False
 32        return is_mult, is_many
 33
 34    # Multiresult cases
 35    keys = list(results.keys())
 36    if 'Classes' in results[keys[0]].keys():
 37        is_mult = True
 38        is_many = True
 39        return is_mult, is_many
 40    elif 'Norms' in results[keys[0]].keys():
 41        is_mult = False
 42        is_many = True
 43        return is_mult, is_many
 44    else:
 45        print("Unknown result structure", flush=True)
 46
 47
 48def sort_groups(df: pd.DataFrame, group_col: str='Group', 
 49                norm_col: str='Kernel Weight'):
 50    """
 51    Takes a dataframe with `group_col` and returns sorted group list 
 52    with groups in decending order by their weights. Assumes there is 
 53    one instance of each group.
 54
 55    Parameters
 56    ----------
 57    df : pd.DataFrame
 58        A dataframe with `group_col` and `norm_col` to be sorted by.
 59
 60    group_col : str
 61        The column containing the group names.
 62
 63    norm_col : str
 64        The column containing the kernel weights.
 65
 66    Returns
 67    -------
 68    group_order : list
 69        A list of groups in descending order according to their kernel 
 70        weights.
 71
 72    Examples
 73    --------
 74    >>> result = scmkl.run(adata, alpha_list)
 75    >>> weights = scmkl.get_weights(result)
 76    >>> group_order = scmkl.sort_groups(weights, 'Group', 
 77    ...                                 'Kernel Weight')
 78    >>>
 79    >>> group_order
 80    ['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLM...', ...]
 81    """
 82    df = df.copy()
 83    df = df.sort_values(norm_col, ascending=False)
 84    group_order = list(df[group_col])
 85
 86    return group_order
 87
 88
 89def format_group_names(group_names: list | pd.Series | np.ndarray, 
 90                       rm_words: list=list()):
 91    """
 92    Takes an ArrayLike object of group names and formats them.
 93
 94    Parameters
 95    ----------
 96    group_names : array_like
 97        An array of group names to format.
 98
 99    rm_words : list
100        Words to remove from all group names.
101
102    Returns
103    -------
104    new_group_names : list
105        Formatted version of the input group names.
106
107    Examples
108    --------
109    >>> groups = ['HALLMARK_E2F_TARGETS', 'HALLMARK_HYPOXIA']
110    >>> new_groups = scmkl.format_group_names(groups)
111    >>> new_groups
112    ['Hallmark E2F Targets', 'Hallmark Hypoxia']
113    """
114    new_group_names = list()
115    rm_words = [word.lower() for word in rm_words]
116
117    for name in group_names:
118        new_name = list()
119        for word in re.split(r'_|\s', name):
120            if word.isalpha() and (len(word) > 3):
121                word = word.capitalize()
122            if word.lower() not in rm_words:
123                new_name.append(word)
124        new_name = ' '.join(new_name)
125        new_group_names.append(new_name)
126
127    return new_group_names
128        
129
def parse_metrics(results: dict, key: str | None=None, 
                   include_as: bool=False) -> pd.DataFrame:
    """
    Builds a long-format pd.DataFrame of the performance metrics 
    stored in a single scMKL result.

    Parameters
    ----------
    results : dict
        A single result dictionary. Metrics are read from 
        `results['Metrics']` when that key exists; otherwise from 
        `results[ct]['Metrics']` for every `ct` in 
        `results['Classes']`.

    key : str | None
        If specified, will add a `'Key'` column to the output 
        dataframe where each element is `key`.

    include_as : bool
        If `True`, adds a boolean `'Alpha Star'` column that is 
        `True` where `'Alpha'` equals `results['Alpha_star']`.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with columns `['Alpha', 'Metric', 'Value']`, 
        plus `'Class'` for multiclass results, `'Alpha Star'` if 
        `include_as` and `'Key'` if `key` is not `None`.
    """
    # Check if is a multiclass result
    is_mult, _ = _parse_result_type(results)

    alphas, metrics, values, classes = [], [], [], []

    if 'Metrics' in results:
        # Binary result: one metrics dictionary per alpha
        for alpha, scores in results['Metrics'].items():
            for name, score in scores.items():
                alphas.append(alpha)
                metrics.append(name)
                values.append(score)

    elif 'Classes' in results:
        # Multiclass result: the same layout nested under each class
        for ct in results['Classes']:
            for alpha, scores in results[ct]['Metrics'].items():
                for name, score in scores.items():
                    alphas.append(alpha)
                    metrics.append(name)
                    values.append(score)
                    classes.append(ct)

    else:
        print(f"{key} is not a scMKL result and will be ignored.")

    columns = {'Alpha' : alphas, 'Metric' : metrics, 'Value' : values}
    if is_mult:
        columns['Class'] = classes

    df = pd.DataFrame(columns)

    if include_as:
        assert 'Alpha_star' in results, "'Alpha_star' not in results"
        df['Alpha Star'] = df['Alpha'] == results['Alpha_star']

    if key is not None:
        df['Key'] = [key] * len(df)

    return df
197
198
def parse_weights(results: dict, include_as: bool=False, 
                   key: None | str=None) -> pd.DataFrame:
    """
    Builds a long-format pd.DataFrame of the group kernel weights 
    stored in a single scMKL result.

    Parameters
    ----------
    results : dict
        A single result dictionary. Weights are read from 
        `results['Norms']` and names from `results['Group_names']` 
        when `'Norms'` exists; otherwise both are read from 
        `results[ct]` for every `ct` in `results['Classes']`.

    include_as : bool
        If `True`, adds a boolean `'Alpha Star'` column that is 
        `True` where `'Alpha'` equals `results['Alpha_star']`.

    key : str | None
        If specified, will add a `'Key'` column to the output 
        dataframe where each element is `key`.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with columns `['Alpha', 'Group', 
        'Kernel Weight']`, plus `'Class'` for multiclass results, 
        `'Alpha Star'` if `include_as` and `'Key'` if `key` is not 
        `None`.
    """
    # Check if is a multiclass result
    is_mult, _ = _parse_result_type(results)

    alphas, groups, weights, classes = [], [], [], []

    if 'Norms' in results:
        # Binary result: one weight vector per alpha
        for alpha, norms in results['Norms'].items():
            alphas.extend([alpha] * len(norms))
            groups.extend(results['Group_names'])
            weights.extend(norms)

    elif 'Classes' in results:
        # Multiclass result: the same layout nested under each class
        for ct in results['Classes']:
            for alpha, norms in results[ct]['Norms'].items():
                n_groups = len(norms)
                alphas.extend([alpha] * n_groups)
                groups.extend(results[ct]['Group_names'])
                weights.extend(norms)
                classes.extend([ct] * n_groups)

    columns = {'Alpha' : alphas, 'Group' : groups, 
               'Kernel Weight' : weights}
    if is_mult:
        columns['Class'] = classes

    df = pd.DataFrame(columns)

    if include_as:
        df['Alpha Star'] = df['Alpha'] == results['Alpha_star']

    if key is not None:
        df['Key'] = [key] * len(df)

    return df
261
262
def extract_results(results: dict, metric: str):
    """
    Summarizes each alpha of a single (binary) scMKL result: the 
    chosen metric, how many groups were selected, and the group with 
    the largest kernel weight.

    Parameters
    ----------
    results : dict
        A result dictionary with `'Metrics'`, `'Norms'` and 
        `'Selected_groups'` entries keyed by alpha, and a 
        `'Group_names'` entry.

    metric : str
        The key to read from `results['Metrics'][alpha]`; also the 
        name of the output column holding it.

    Returns
    -------
    summary : pd.DataFrame
        A dataframe with columns `['Alpha', metric, 
        'Number of Selected Groups', 'Top Group']` and one row per 
        alpha. `'Top Group'` is `"No groups selected"` when no group 
        was selected for that alpha. If several groups tie for the 
        largest weight, the first one in `'Group_names'` is reported.
    """
    summary = {'Alpha' : [],
               metric : [],
               'Number of Selected Groups' : [],
               'Top Group' : []}

    group_names = np.asarray(results['Group_names'])

    # Creating summary row for each model
    for alpha, scores in results['Metrics'].items():
        num_selected = len(results['Selected_groups'][alpha])

        if num_selected == 0:
            top_group = "No groups selected"
        else:
            # argmax always yields exactly one index, even with tied 
            # weights; assumes norms are 1-D and aligned with 
            # 'Group_names' -- TODO confirm
            norms = np.asarray(results['Norms'][alpha])
            top_group = group_names[int(np.argmax(norms))]

        summary['Alpha'].append(alpha)
        summary[metric].append(scores[metric])
        summary['Number of Selected Groups'].append(num_selected)
        summary['Top Group'].append(top_group)

    return pd.DataFrame(summary)
291
292
def get_summary(results: dict, metric: str='AUROC'):
    """
    Takes a single scMKL result and generates a dataframe with one 
    row per model containing columns for alpha, the chosen metric, 
    number of selected groups, and highest weighted group.

    Parameters
    ----------
    results : dict
        A single result dictionary from scMKL, e.g. generated from 
        `scmkl.run()`. A dictionary of several results is rejected.

    metric : str
        Which metric to include in the summary. Default is AUROC. 
        Options include `'AUROC'`, `'Recall'`, `'Precision'`, 
        `'Accuracy'`, and `'F1-Score'`.

    Returns
    -------
    summary_df : pd.DataFrame
        A table with columns: `['Alpha', metric, 
        'Number of Selected Groups', 'Top Group']`. Multiclass 
        results are summarized per class, stacked, and labelled with 
        an additional `'Class'` column.
    
    Examples
    --------
    >>> results = scmkl.run(adata, alpha_list)
    >>> summary_df = scmkl.get_summary(results)
    ...
    >>> summary_df.head()
        Alpha   AUROC  Number of Selected Groups 
    0   2.20  0.8600                          3   
    1   1.96  0.9123                          4   
    2   1.72  0.9357                          5   
    3   1.48  0.9524                          7   
    4   1.24  0.9666                          9   
        Top Group
    0   RNA-HALLMARK_E2F_TARGETS
    1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    """
    is_multi, is_many = _parse_result_type(results)
    assert not is_many, "This function only supports single results"

    # Binary result: a single summary table is all there is
    if not is_multi:
        return extract_results(results, metric)

    # Multiclass result: one table per class, tagged and stacked
    per_class = []
    for ct in results['Classes']:
        class_summary = extract_results(results[ct], metric)
        class_summary['Class'] = [ct] * len(class_summary)
        per_class.append(class_summary.copy())

    return pd.concat(per_class)
351
352
353def read_files(dir: str, pattern: str | None=None) -> dict:
354    """
355    This function takes a directory of scMKL results as pickle files 
356    and returns a dictionary with the file names as keys and the data 
357    from the respective files as the values.
358
359    Parameters
360    ----------
361    dir : str
362        A string specifying the file path for the output scMKL runs.
363
364    pattern : str
365        A regex string for filtering down to desired files. If 
366        `None`, all files in the directory with the pickle file 
367        extension will be added to the dictionary.
368
369    Returns
370    -------
371    results : dict
372        A dictionary with the file names as keys and data as values.
373
374    Examples
375    --------
376    >>> filepath = 'scMKL_results/rna+atac/'
377    ...
378    >>> all_results = scmkl.read_files(filepath)
379    >>> all_results.keys()
380    dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
381    """
382    # Reading all pickle files in patter is None
383    if pattern is None:
384        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
385                 for file in os.listdir(dir) if '.pkl' in file}
386    
387    # Reading only files matching pattern if not None
388    else:
389        pattern = repr(pattern)
390        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
391                 for file in os.listdir(dir) 
392                 if re.fullmatch(pattern, file) is not None}
393        
394    return data
395
396
def get_metrics(results: dict, include_as: bool=False) -> pd.DataFrame:
    """
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When 
    multiple results are given, a `'Key'` column holds the dictionary 
    key each row came from.

    Parameters
    ----------
    results : dict
        Either the results of a single run (e.g. from 
        `scmkl.run()`), or a dictionary of such results (e.g. from 
        `scmkl.read_files()`).

    include_as : bool
        When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    df : pd.DataFrame
        A pd.DataFrame containing all of the metrics present from 
        the runs input. For multiple results the per-result frames 
        are stacked without resetting the index.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results)

    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles)
    """
    # Checking which data is being worked with 
    is_mult, is_many = _parse_result_type(results)

    if not is_many:
        return parse_metrics(results = results, include_as = include_as)

    # Initiating col list with minimal columns
    cols = ['Alpha', 'Metric', 'Value']

    if include_as:
        cols.append('Alpha Star')
    if is_mult:
        cols.append('Class')
    cols.append('Key')

    # The leading empty frame fixes the column order; concatenating once 
    # avoids re-copying the accumulated rows for every result
    frames = [pd.DataFrame(columns = cols)]
    for key, result in results.items():
        frames.append(parse_metrics(results = result, key = key, 
                                    include_as = include_as))

    return pd.concat(frames)
463
464
def get_weights(results : dict, include_as : bool = False) -> pd.DataFrame:
    """
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If `include_as == True`, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via cross validation. When multiple results 
    are given, a `'Key'` column holds the dictionary key each row 
    came from.

    Parameters
    ----------
    results : dict
        Either the results of a single run (e.g. from 
        `scmkl.run()`), or a dictionary of such results (e.g. from 
        `scmkl.read_files()`).

    include_as : bool
        When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    df : pd.DataFrame
        A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights. For multiple results 
        the per-result frames are stacked without resetting the 
        index.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results)
    
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles)
    """
    # Checking which data is being worked with 
    is_mult, is_many = _parse_result_type(results)

    if not is_many:
        return parse_weights(results = results, include_as = include_as)

    # Initiating col list with minimal columns
    cols = ['Alpha', 'Group', 'Kernel Weight']

    if include_as:
        cols.append('Alpha Star')
    if is_mult:
        cols.append('Class')
    cols.append('Key')

    # The leading empty frame fixes the column order; concatenating once 
    # avoids re-copying the accumulated rows for every result
    frames = [pd.DataFrame(columns = cols)]
    for key, result in results.items():
        frames.append(parse_weights(results = result, key = key, 
                                    include_as = include_as))

    return pd.concat(frames)
528
529
def get_selection(weights_df: pd.DataFrame, 
                  order_groups: bool=False) -> pd.DataFrame:
    """
    This function takes a pd.DataFrame created by 
    `scmkl.get_weights()` and returns a selection table. Selection 
    refers to how many times a group had a nonzero (positive) group 
    weight. To calculate this, each row is flagged as selected or 
    not, the rows are grouped by alpha and group, and the flags are 
    summed, returning a dataframe with cols 
    `['Alpha', 'Group', 'Selection']`. If is the result of multiclass 
    run(s), `'Class'` column must be present and will be in resulting 
    df as well.

    Parameters
    ----------
    weights_df : pd.DataFrame
        A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`. If is the result of 
        multiclass run(s), `'Class'` column must be present as well. 
        It is not modified.

    order_groups : bool
        If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col ordered by number of times 
        each group was selected in descending order. Has no effect 
        when a `'Class'` column is present.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with cols `['Alpha', 'Group', 'Selection']`. 
        Also, `'Class'` column if is a multiclass result.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results)
    >>> selection = scmkl.get_selection(weights)
    
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles)
    >>> selection = scmkl.get_selection(weights)
    """
    # Flagging selected rows on a new frame so that the caller's 
    # dataframe does not gain a 'Selection' column as a side effect
    flagged = weights_df.assign(Selection = weights_df['Kernel Weight'] > 0)

    # Summing selection across replications to get selection
    is_mult = 'Class' in flagged.columns
    group_cols = ['Alpha', 'Group']
    if is_mult:
        group_cols.append('Class')
    df = flagged.groupby(group_cols)['Selection'].sum().reset_index()

    # Getting group order
    if order_groups and not is_mult:
        order = df.groupby('Group')['Selection'].sum()
        order = order.reset_index().sort_values(by = 'Selection', 
                                                ascending = False)
        order = order['Group']
        df['Group'] = pd.Categorical(df['Group'], categories = order)

    return df
596
597
def groups_per_alpha(selection_df: pd.DataFrame) -> dict:
    """
    This function takes a pd.DataFrame from `scmkl.get_selection()` 
    and returns a dictionary with keys being the unique alphas from 
    the input dataframe and values being the mean of the 
    `'Selection'` column over the rows with that alpha.

    Parameters
    ----------
    selection_df : pd.DataFrame
        A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.
    
    Returns
    -------
    mean_groups : dict
        A dictionary with alphas as keys and the mean `'Selection'` 
        for that alpha as values.

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    """
    alphas = selection_df['Alpha']

    # One entry per unique alpha: average 'Selection' over its rows
    return {alpha : np.mean(selection_df[alphas == alpha]['Selection'])
            for alpha in np.unique(alphas)}
639
640
def read_gtf(path: str, filter_to_coding: bool=False):
    """
    Reads and formats a gtf file. Adds colnames: `['chr', 'source', 
    'feature', 'start', 'end', 'score', 'strand', 'frame', 
    'attribute']`.

    Parameters
    ----------
    path : str
        The file path to the gtf file to be read in. If the file is 
        gzipped, file name must end with .gz.

    filter_to_coding : bool
        If `True`, will keep only rows whose `'feature'` is `'gene'` 
        and whose `'attribute'` contains `'protein_coding'`, and 
        will add column `'gene_name'` containing the gene name for 
        each row.

    Returns
    -------
    df : pd.DataFrame
        A pandas dataframe of the input gtf file. With 
        `filter_to_coding`, `'gene_name'` is NaN for rows whose 
        attribute has no `gene_name "..."` entry.

    Examples
    --------
    >>> import scmkl
    >>>
    >>> file = 'data/hg38_subset_protein_coding.annotation.gtf'
    >>> gtf = scmkl.read_gtf(file)
    >>>
    >>> gtf.head()
            chr  source     feature  start    end score strand frame                                          
    0  chr1  HAVANA        gene  11869  14409     .      +     .  
    1  chr1  HAVANA  transcript  11869  14409     .      +     .  
    2  chr1  HAVANA        exon  11869  12227     .      +     .  
    3  chr1  HAVANA        exon  12613  12721     .      +     .  
    4  chr1  HAVANA        exon  13221  14409     .      +     .  
    attribute
    gene_id "ENSG00000223972.5"; gene_type "transc...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    """
    df = pd.read_csv(path, sep='\t', comment='#', 
                     skip_blank_lines=True, header=None)
    
    df.columns = ['chr', 'source', 'feature', 'start', 'end', 
                  'score', 'strand', 'frame', 'attribute']
    
    if filter_to_coding:
        df = df[df['attribute'].str.contains('protein_coding')]
        # Copying so the new column is written to an independent frame 
        # rather than to a slice of the unfiltered one
        df = df[df['feature'] == 'gene'].copy()

        # Capturing everything between the quotes so that names with 
        # '-' or '.' (e.g. HLA-A) are not cut short
        df['gene_name'] = df['attribute'].str.extract(
            r'gene_name "([^"]+)"', expand=False)
    
    return df
def sort_groups( df: pandas.core.frame.DataFrame, group_col: str = 'Group', norm_col: str = 'Kernel Weight'):
def sort_groups(df: pd.DataFrame, group_col: str='Group', 
                norm_col: str='Kernel Weight'):
    """
    Ranks the groups in `df` from highest to lowest kernel weight. 
    Assumes there is one instance of each group.

    Parameters
    ----------
    df : pd.DataFrame
        A dataframe with `group_col` and `norm_col` to be sorted by. 
        It is not modified.

    group_col : str
        The column containing the group names.

    norm_col : str
        The column containing the kernel weights.

    Returns
    -------
    group_order : list
        The values of `group_col`, ordered by descending `norm_col`.

    Examples
    --------
    >>> result = scmkl.run(adata, alpha_list)
    >>> weights = scmkl.get_weights(result)
    >>> group_order = scmkl.sort_groups(weights, 'Group', 
    ...                                 'Kernel Weight')
    >>>
    >>> group_order
    ['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLM...', ...]
    """
    # `sort_values` returns a new frame, leaving the input untouched
    ranked = df.sort_values(by=norm_col, ascending=False)

    return list(ranked[group_col])

Takes a dataframe with group_col and returns sorted group list with groups in descending order by their weights. Assumes there is one instance of each group.

Parameters
  • df (pd.DataFrame): A dataframe with group_col and norm_col to be sorted by.
  • group_col (str): The column containing the group names.
  • norm_col (str): The column containing the kernel weights.
Returns
  • group_order (list): A list of groups in descending order according to their kernel weights.
Examples
>>> result = scmkl.run(adata, alpha_list)
>>> weights = scmkl.get_weights(result)
>>> group_order = scmkl.sort_groups(weights, 'Group', 
...                                 'Kernel Weight')
>>>
>>> group_order
['HALLMARK_ESTROGEN_RESPONSE_EARLY', 'HALLM...', ...]
def format_group_names( group_names: list | pandas.core.series.Series | numpy.ndarray, rm_words: list = []):
 90def format_group_names(group_names: list | pd.Series | np.ndarray, 
 91                       rm_words: list=list()):
 92    """
 93    Takes an ArrayLike object of group names and formats them.
 94
 95    Parameters
 96    ----------
 97    group_names : array_like
 98        An array of group names to format.
 99
100    rm_words : list
101        Words to remove from all group names.
102
103    Returns
104    -------
105    new_group_names : list
106        Formatted version of the input group names.
107
108    Examples
109    --------
110    >>> groups = ['HALLMARK_E2F_TARGETS', 'HALLMARK_HYPOXIA']
111    >>> new_groups = scmkl.format_group_names(groups)
112    >>> new_groups
113    ['Hallmark E2F Targets', 'Hallmark Hypoxia']
114    """
115    new_group_names = list()
116    rm_words = [word.lower() for word in rm_words]
117
118    for name in group_names:
119        new_name = list()
120        for word in re.split(r'_|\s', name):
121            if word.isalpha() and (len(word) > 3):
122                word = word.capitalize()
123            if word.lower() not in rm_words:
124                new_name.append(word)
125        new_name = ' '.join(new_name)
126        new_group_names.append(new_name)
127
128    return new_group_names

Takes an ArrayLike object of group names and formats them.

Parameters
  • group_names (array_like): An array of group names to format.
  • rm_words (list): Words to remove from all group names.
Returns
  • new_group_names (list): Formatted version of the input group names.
Examples
>>> groups = ['HALLMARK_E2F_TARGETS', 'HALLMARK_HYPOXIA']
>>> new_groups = scmkl.format_group_names(groups)
>>> new_groups
['Hallmark E2F Targets', 'Hallmark Hypoxia']
def parse_metrics( results: dict, key: str | None = None, include_as: bool = False) -> pandas.core.frame.DataFrame:
def parse_metrics(results: dict, key: str | None=None, 
                   include_as: bool=False) -> pd.DataFrame:
    """
    Builds a long-format pd.DataFrame of the performance metrics 
    stored in a single scMKL result.

    Parameters
    ----------
    results : dict
        A single result dictionary. Metrics are read from 
        `results['Metrics']` when that key exists; otherwise from 
        `results[ct]['Metrics']` for every `ct` in 
        `results['Classes']`.

    key : str | None
        If specified, will add a `'Key'` column to the output 
        dataframe where each element is `key`.

    include_as : bool
        If `True`, adds a boolean `'Alpha Star'` column that is 
        `True` where `'Alpha'` equals `results['Alpha_star']`.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with columns `['Alpha', 'Metric', 'Value']`, 
        plus `'Class'` for multiclass results, `'Alpha Star'` if 
        `include_as` and `'Key'` if `key` is not `None`.
    """
    # Check if is a multiclass result
    is_mult, _ = _parse_result_type(results)

    alphas, metrics, values, classes = [], [], [], []

    if 'Metrics' in results:
        # Binary result: one metrics dictionary per alpha
        for alpha, scores in results['Metrics'].items():
            for name, score in scores.items():
                alphas.append(alpha)
                metrics.append(name)
                values.append(score)

    elif 'Classes' in results:
        # Multiclass result: the same layout nested under each class
        for ct in results['Classes']:
            for alpha, scores in results[ct]['Metrics'].items():
                for name, score in scores.items():
                    alphas.append(alpha)
                    metrics.append(name)
                    values.append(score)
                    classes.append(ct)

    else:
        print(f"{key} is not a scMKL result and will be ignored.")

    columns = {'Alpha' : alphas, 'Metric' : metrics, 'Value' : values}
    if is_mult:
        columns['Class'] = classes

    df = pd.DataFrame(columns)

    if include_as:
        assert 'Alpha_star' in results, "'Alpha_star' not in results"
        df['Alpha Star'] = df['Alpha'] == results['Alpha_star']

    if key is not None:
        df['Key'] = [key] * len(df)

    return df

This function returns a pd.DataFrame for a single scMKL result with performance results.

Parameters
  • results (dict): A result dictionary from scmkl.run.
  • key (str): If specified, will add a key column to the output dataframe where each element is key.
  • include_as (bool): If True, will add a column indicating which models used the optimal alphas.
Returns
  • df (pd.DataFrame): A dataframe with columns ['Alpha', 'Metric', 'Value']. 'Key' col only added if key is not None.
def parse_weights( results: dict, include_as: bool = False, key: None | str = None) -> pandas.core.frame.DataFrame:
def parse_weights(results: dict, include_as: bool=False, 
                   key: None | str=None) -> pd.DataFrame:
    """
    This function returns a pd.DataFrame for a single scMKL result 
    with group weights.

    Parameters
    ----------
    results : dict
        A single result dictionary. Binary results are expected to 
        hold `'Norms'` and `'Group_names'`; multiclass results are 
        expected to hold `'Classes'` and one sub-result per class.

    include_as : bool
        If `True`, will add an `'Alpha Star'` column that is `True` 
        for rows whose alpha equals `results['Alpha_star']`.

    key : str
        If specified, will add a key column to the output dataframe 
        where each element is `key`.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with columns `['Alpha', 'Group', 
        'Kernel Weight']`. A `'Class'` col is added for multiclass 
        results, `'Alpha Star'` if `include_as` is `True`, and 
        `'Key'` only if `key` is not `None`.

    Raises
    ------
    KeyError
        If `include_as` is `True` and `'Alpha_star'` is not in 
        `results`.
    """
    # Check if is a multiclass result
    is_mult, _ = _parse_result_type(results)

    df = {
        'Alpha' : list(),
        'Group' : list(),
        'Kernel Weight' : list()
    }

    if is_mult:
        df['Class'] = list()

    # Ensuring results is a scMKL result and checking multiclass
    if 'Norms' in results:
        for alpha, norms in results['Norms'].items():
            df['Alpha'].extend([alpha] * len(norms))
            df['Group'].extend(results['Group_names'])
            df['Kernel Weight'].extend(norms)

    elif 'Classes' in results:
        for ct in results['Classes']:
            for alpha, norms in results[ct]['Norms'].items():
                df['Alpha'].extend([alpha] * len(norms))
                df['Group'].extend(results[ct]['Group_names'])
                df['Kernel Weight'].extend(norms)
                df['Class'].extend([ct] * len(norms))

    else:
        # Same notice as parse_metrics() instead of silently returning 
        # an empty dataframe
        print(f"{key} is not a scMKL result and will be ignored.")

    df = pd.DataFrame(df)

    if include_as:
        # Fail with an explicit message rather than a bare KeyError
        if 'Alpha_star' not in results:
            raise KeyError("'Alpha_star' not in results")
        df['Alpha Star'] = df['Alpha'] == results['Alpha_star']

    if key is not None:
        df['Key'] = [key] * df.shape[0]

    return df

This function returns a pd.DataFrame for a single scMKL result with group weights.

Parameters
  • results (dict): A result dictionary from scmkl.run.
  • key (str): If specified, will add a key column to the output dataframe where each element is key.
  • include_as (bool): If True, will add a column indicating which models used the optimal alpha.
Returns
  • df (pd.DataFrame): A dataframe with columns ['Alpha', 'Group', 'Kernel Weight']. 'Key' col only added if key is not None.
def extract_results(results: dict, metric: str):
def extract_results(results: dict, metric: str):
    """
    Builds a per-alpha summary table for a single (binary) scMKL 
    result.

    Parameters
    ----------
    results : dict
        A single result dictionary holding `'Metrics'`, `'Norms'`, 
        `'Selected_groups'` (each keyed by alpha) and 
        `'Group_names'`.

    metric : str
        The key of the metric to pull from 
        `results['Metrics'][alpha]`; also used as the column name.

    Returns
    -------
    summary : pd.DataFrame
        A table with columns `['Alpha', metric, 
        'Number of Selected Groups', 'Top Group']` and one row per 
        alpha. `'Top Group'` is `"No groups selected"` when no group 
        was selected for that alpha. If several groups tie for the 
        largest weight, the first one is reported.
    """
    summary = {'Alpha' : list(),
               metric : list(),
               'Number of Selected Groups' : list(),
               'Top Group' : list()}

    group_names = results['Group_names']

    # Creating one summary row for each model (alpha)
    for alpha in results['Metrics']:
        num_selected = len(results['Selected_groups'][alpha])

        if num_selected == 0:
            top_group = "No groups selected"
        else:
            # argmax always yields exactly one index, so ties between 
            # groups (or norms stored as a plain list) cannot break the 
            # row the way unpacking an np.where() match could
            norms = np.asarray(results['Norms'][alpha])
            top_group = group_names[int(np.argmax(norms))]

        summary['Alpha'].append(alpha)
        summary[metric].append(results['Metrics'][alpha][metric])
        summary['Number of Selected Groups'].append(num_selected)
        summary['Top Group'].append(top_group)

    return pd.DataFrame(summary)
def get_summary(results: dict, metric: str = 'AUROC'):
def get_summary(results: dict, metric: str='AUROC'):
    """
    Summarizes a single scMKL result as a dataframe with one row per 
    model (alpha): the alpha, the requested metric, the number of 
    selected groups, and the highest weighted group.

    Parameters
    ----------
    results : dict
        A single result dictionary, either binary or multiclass. 
        Dictionaries holding several results are not supported.

    metric : str
        Which metric to include in the summary. Default is AUROC. 
        Options include `'AUROC'`, `'Recall'`, `'Precision'`, 
        `'Accuracy'`, and `'F1-Score'`.

    Returns
    -------
    summary_df : pd.DataFrame
        A table with columns: `['Alpha', metric, 
        'Number of Selected Groups', 'Top Group']`. For multiclass 
        results the per-class tables are stacked and a `'Class'` 
        column is added; the row index restarts for each class.

    Raises
    ------
    AssertionError
        If `results` holds multiple results.

    Examples
    --------
    >>> results = scmkl.run(adata, alpha_list)
    >>> summary_df = scmkl.get_summary(results)
    ...
    >>> summary_df.head()
        Alpha   AUROC  Number of Selected Groups 
    0   2.20  0.8600                          3   
    1   1.96  0.9123                          4   
    2   1.72  0.9357                          5   
    3   1.48  0.9524                          7   
    4   1.24  0.9666                          9   
        Top Group
    0   RNA-HALLMARK_E2F_TARGETS
    1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    """
    multiclass, many = _parse_result_type(results)
    assert not many, "This function only supports single results"

    # Binary result: the per-alpha table is already the summary
    if not multiclass:
        return extract_results(results, metric)

    # Multiclass result: one table per class, tagged and stacked
    per_class = list()
    for cell_class in results['Classes']:
        class_summary = extract_results(results[cell_class], metric)
        class_summary['Class'] = [cell_class] * len(class_summary)
        per_class.append(class_summary.copy())

    return pd.concat(per_class)

Takes the results from scmkl.run and generates a dataframe for each model containing columns for alpha, area under the ROC, number of groups with nonzero weights, and highest weighted group.

Parameters
  • results (dict): A dictionary of results from scMKL generated from scmkl.run.
  • metric (str): Which metric to include in the summary. Default is AUROC. Options include 'AUROC', 'Recall', 'Precision', 'Accuracy', and 'F1-Score'.
Returns
  • summary_df (pd.DataFrame): A table with columns: ['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group'].
Examples
>>> results = scmkl.run(adata, alpha_list)
>>> summary_df = scmkl.get_summary(results)
...
>>> summary_df.head()
    Alpha   AUROC  Number of Selected Groups 
0   2.20  0.8600                          3   
1   1.96  0.9123                          4   
2   1.72  0.9357                          5   
3   1.48  0.9524                          7   
4   1.24  0.9666                          9   
    Top Group
0   RNA-HALLMARK_E2F_TARGETS
1   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
2   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
3   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
4   RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
def read_files(dir: str, pattern: str | None = None) -> dict:
354def read_files(dir: str, pattern: str | None=None) -> dict:
355    """
356    This function takes a directory of scMKL results as pickle files 
357    and returns a dictionary with the file names as keys and the data 
358    from the respective files as the values.
359
360    Parameters
361    ----------
362    dir : str
363        A string specifying the file path for the output scMKL runs.
364
365    pattern : str
366        A regex string for filtering down to desired files. If 
367        `None`, all files in the directory with the pickle file 
368        extension will be added to the dictionary.
369
370    Returns
371    -------
372    results : dict
373        A dictionary with the file names as keys and data as values.
374
375    Examples
376    --------
377    >>> filepath = 'scMKL_results/rna+atac/'
378    ...
379    >>> all_results = scmkl.read_files(filepath)
380    >>> all_results.keys()
381    dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
382    """
383    # Reading all pickle files in patter is None
384    if pattern is None:
385        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
386                 for file in os.listdir(dir) if '.pkl' in file}
387    
388    # Reading only files matching pattern if not None
389    else:
390        pattern = repr(pattern)
391        data = {file : np.load(f'{dir}/{file}', allow_pickle = True)
392                 for file in os.listdir(dir) 
393                 if re.fullmatch(pattern, file) is not None}
394        
395    return data

This function takes a directory of scMKL results as pickle files and returns a dictionary with the file names as keys and the data from the respective files as the values.

Parameters
  • dir (str): A string specifying the file path for the output scMKL runs.
  • pattern (str): A regex string for filtering down to desired files. If None, all files in the directory with the pickle file extension will be added to the dictionary.
Returns
  • results (dict): A dictionary with the file names as keys and data as values.
Examples
>>> filepath = 'scMKL_results/rna+atac/'
...
>>> all_results = scmkl.read_files(filepath)
>>> all_results.keys()
dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...])
def get_metrics(results: dict, include_as: bool = False) -> pandas.core.frame.DataFrame:
def get_metrics(results: dict, include_as: bool=False) -> pd.DataFrame:
    """
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When a 
    dictionary of multiple results is given, a `'Key'` column is 
    added holding the key of the respective result.

    Parameters
    ----------
    results : dict
        Either a single result dictionary or a dictionary of result 
        dictionaries (for example the output of 
        `scmkl.read_files()`).

    include_as : bool
        When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    df : pd.DataFrame
        A pd.DataFrame containing all of the metrics present from 
        the runs input. A `'Class'` column is present for multiclass 
        results.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results)

    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles)
    """
    # Checking which data is being worked with 
    is_mult, is_many = _parse_result_type(results)

    if not is_many:
        return parse_metrics(results = results, include_as = include_as)

    # Column order of the combined dataframe
    cols = ['Alpha', 'Metric', 'Value']

    if include_as:
        cols.append('Alpha Star')
    if is_mult:
        cols.append('Class')
    cols.append('Key')

    # Parsing every result first and concatenating once; growing a 
    # dataframe from an empty seed is quadratic and its column dtypes 
    # depend on how pandas treats empty entries in concat
    frames = [parse_metrics(results = result, key = key, 
                            include_as = include_as)
              for key, result in results.items()]
    df = pd.concat(frames)

    # Expected columns first, any unexpected ones kept at the end
    extra = [col for col in df.columns if col not in cols]
    df = df.reindex(columns = cols + extra)

    return df

Takes either a single scMKL result or a dictionary where each entry corresponds to one result. Returns a dataframe with cols ['Alpha', 'Metric', 'Value']. If include_as == True, another col of booleans will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV. When a dictionary of multiple results is given, another column, 'Key', will be added with the key of the respective result.

Parameters
  • results (dict | None): A dictionary with the results of a single run from scmkl.run. Must be None if rfiles is not None.
  • rfiles (dict | None): A dictionary of results dictionaries containing multiple results from scmkl.run.
  • include_as (bool): When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.
Returns
  • df (pd.DataFrame): A pd.DataFrame containing all of the metrics present from the runs input.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> metrics = scmkl.get_metrics(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> metrics = scmkl.get_metrics(rfiles=rfiles)
def get_weights(results: dict, include_as: bool = False) -> pandas.core.frame.DataFrame:
def get_weights(results : dict, include_as : bool = False) -> pd.DataFrame:
    """
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If `include_as == True`, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via cross validation. When a dictionary of 
    multiple results is given, a `'Key'` column is added holding the 
    key of the respective result.

    Parameters
    ----------
    results : dict
        Either a single result dictionary or a dictionary of result 
        dictionaries (for example the output of 
        `scmkl.read_files()`).

    include_as : bool
        When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    df : pd.DataFrame
        A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights. A `'Class'` column 
        is present for multiclass results.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results)
    
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles)
    """
    # Checking which data is being worked with 
    is_mult, is_many = _parse_result_type(results)

    if not is_many:
        return parse_weights(results = results, include_as = include_as)

    # Column order of the combined dataframe
    cols = ['Alpha', 'Group', 'Kernel Weight']

    if include_as:
        cols.append('Alpha Star')
    if is_mult:
        cols.append('Class')
    cols.append('Key')

    # Parsing every result first and concatenating once; growing a 
    # dataframe from an empty seed is quadratic and its column dtypes 
    # depend on how pandas treats empty entries in concat
    frames = [parse_weights(results = result, key = key, 
                            include_as = include_as)
              for key, result in results.items()]
    df = pd.concat(frames)

    # Expected columns first, any unexpected ones kept at the end
    extra = [col for col in df.columns if col not in cols]
    df = df.reindex(columns = cols + extra)

    return df

Takes either a single scMKL result or dictionary of results and returns a pd.DataFrame with cols ['Alpha', 'Group', 'Kernel Weight']. If include_as == True, a fourth col will be added to indicate whether or not the run respective to that alpha was chosen as optimal via cross validation.

Parameters
  • results (dict | None): A dictionary with the results of a single run from scmkl.run. Must be None if rfiles is not None.
  • rfiles (dict | None): A dictionary of results dictionaries containing multiple results from scmkl.run.
  • include_as (bool): When True, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True.
Returns
  • df (pd.DataFrame): A pd.DataFrame containing all of the groups from each alpha and their corresponding kernel weights.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles=rfiles)
def get_selection( weights_df: pandas.core.frame.DataFrame, order_groups: bool = False) -> pandas.core.frame.DataFrame:
def get_selection(weights_df: pd.DataFrame, 
                  order_groups: bool=False) -> pd.DataFrame:
    """
    Builds a selection table from a dataframe created by 
    `scmkl.get_weights()`. Selection is the number of rows in which a 
    group had a kernel weight greater than zero, counted per alpha 
    and group (and per class, when a `'Class'` column is present).

    Parameters
    ----------
    weights_df : pd.DataFrame
        A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`. If is the result of 
        multiclass run(s), `'Class'` column must be present as well. 
        Note that a boolean `'Selection'` column is written onto 
        this dataframe.

    order_groups : bool
        If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col ordered by number of times 
        each group was selected in descending order. Has no effect 
        when a `'Class'` column is present.

    Returns
    -------
    df : pd.DataFrame
        A dataframe with cols `['Alpha', 'Group', 'Selection']`. 
        Also, `'Class'` column if is a multiclass result.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results)
    >>> selection = scmkl.get_selection(weights)
    
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles)
    >>> selection = scmkl.get_selection(weights)
    """
    # Flagging rows where the group received a positive weight
    weights_df['Selection'] = weights_df['Kernel Weight'].apply(
        lambda weight: weight > 0
    )

    # Counting selections per alpha/group (and class when available)
    has_class = 'Class' in weights_df.columns
    group_keys = ['Alpha', 'Group']
    if has_class:
        group_keys.append('Class')
    df = weights_df.groupby(group_keys)['Selection'].sum().reset_index()

    # Group ordering is only applied to non-multiclass tables
    if has_class or not order_groups:
        return df

    # Ranking groups by their total selection across alphas
    totals = df.groupby('Group')['Selection'].sum().reset_index()
    totals = totals.sort_values(by = 'Selection', ascending = False)
    df['Group'] = pd.Categorical(df['Group'], categories = totals['Group'])

    return df

This function takes a pd.DataFrame created by scmkl.get_weights() and returns a selection table. Selection refers to how many times a group had a nonzero group weight. To calculate this, a col is added indicating whether the group was selected. Then, the dataframe is grouped by alpha and group. Selection can then be summed returning a dataframe with cols ['Alpha', 'Group', Selection]. If is the result of multiclass run(s), 'Class' column must be present and will be in resulting df as well.

Parameters
  • weights_df (pd.DataFrame): A dataframe output by scmkl.get_weights() with cols ['Alpha', 'Group', 'Kernel Weight']. If is the result of multiclass run(s), 'Class' column must be present as well.
  • order_groups (bool): If True, the 'Group' col of the output dataframe will be made into a pd.Categorical col ordered by number of times each group was selected in descending order.
Returns
  • df (pd.DataFrame): A dataframe with cols ['Alpha', 'Group', Selection]. Also, 'Class' column if is a multiclass result.
Example
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> selection = scmkl.get_selection(weights)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles=rfiles)
>>> selection = scmkl.get_selection(weights)
def groups_per_alpha(selection_df: pandas.core.frame.DataFrame) -> dict:
def groups_per_alpha(selection_df: pd.DataFrame) -> dict:
    """
    Takes a pd.DataFrame from `scmkl.get_selection()` and returns a 
    dictionary keyed by the alphas found in the dataframe. Each value 
    is the mean of the `'Selection'` column over the rows belonging 
    to that alpha.

    Parameters
    ----------
    selection_df : pd.DataFrame
        A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.
    
    Returns
    -------
    mean_groups : dict
        A dictionary with alphas as keys and the mean `'Selection'` 
        for that alpha as values.

    Notes
    -----
    NOTE(review): the value is averaged over the rows (groups) of an 
    alpha, not over results; confirm this is the intended "mean 
    number of selected groups".

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    """
    alphas = selection_df['Alpha']

    # One entry per distinct alpha, averaging that alpha's rows
    return {
        alpha : np.mean(selection_df[alphas == alpha]['Selection'])
        for alpha in np.unique(alphas)
    }

This function takes a pd.DataFrame from scmkl.get_selection() generated from multiple scMKL results and returns a dictionary with keys being alphas from the input dataframe and values being the mean number of selected groups for a given alpha across results.

Parameters
  • selection_df (pd.DataFrame): A dataframe output by scmkl.get_selection() with cols ['Alpha', 'Group', 'Selection'].
Returns
  • mean_groups (dict): A dictionary with alphas as keys and the mean number of selected groups for that alpha as values.
Examples
>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights)
>>> mean_groups = scmkl.mean_groups_per_alpha(selection)
>>> mean_groups = {alpha : np.round(num_selected, 1)
...                for alpha, num_selected in mean_groups.items()}
>>>
>>> print(mean_groups)
{0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
def read_gtf(path: str, filter_to_coding: bool = False):
def read_gtf(path: str, filter_to_coding: bool=False):
    """
    Reads and formats a gtf file. Adds colnames: `['chr', 'source', 
    'feature', 'start', 'end', 'score', 'strand', 'frame', 
    'attribute']`.

    Parameters
    ----------
    path : str
        The file path to the gtf file to be read in. If the file is 
        gzipped, file name must end with .gz.

    filter_to_coding : bool
        If `True`, will filter rows in gtf data frame to only 
        `'gene'` features whose attribute mentions 
        `'protein_coding'`. Will add column `'gene_name'` containing 
        the gene name for each row (missing if the attribute has no 
        `gene_name` entry).

    Returns
    -------
    df : pd.DataFrame
        A pandas dataframe of the input gtf file.

    Examples
    --------
    >>> import scmkl
    >>>
    >>> file = 'data/hg38_subset_protein_coding.annotation.gtf'
    >>> gtf = scmkl.read_gtf(file)
    >>>
    >>> gtf.head()
            chr  source     feature  start    end score strand frame                                          
    0  chr1  HAVANA        gene  11869  14409     .      +     .  
    1  chr1  HAVANA  transcript  11869  14409     .      +     .  
    2  chr1  HAVANA        exon  11869  12227     .      +     .  
    3  chr1  HAVANA        exon  12613  12721     .      +     .  
    4  chr1  HAVANA        exon  13221  14409     .      +     .  
    attribute
    gene_id "ENSG00000223972.5"; gene_type "transc...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    gene_id "ENSG00000223972.5"; transcript_id "EN...
    """
    df = pd.read_csv(path, sep='\t', comment='#', 
                     skip_blank_lines=True, header=None)
    
    df.columns = ['chr', 'source', 'feature', 'start', 'end', 
                  'score', 'strand', 'frame', 'attribute']
    
    if filter_to_coding:
        prot_rows = df['attribute'].str.contains('protein_coding')
        df = df[prot_rows]

        # Copying so the new column is not assigned onto a slice
        df = df[df['feature'] == 'gene'].copy()

        # Capturing everything between the quotes after gene_name; a 
        # character class such as [A-z0-9] would cut names at '-' or 
        # '.' (e.g. 'HLA-DRA' -> 'HLA')
        df['gene_name'] = df['attribute'].str.extract(
            r'gene_name "([^"]+)"', expand=False
        )
    
    return df

Reads and formats a gtf file. Adds colnames: ['chr', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'].

Parameters
  • path (str): The file path to the gtf file to be read in. If the file is gzipped, file name must end with .gz.
  • filter_to_coding (bool): If True, will filter rows in gtf data frame to only protein coding genes. Will add column 'gene_name' containing the gene name for each row.
Returns
  • df (pd.DataFrame): A pandas dataframe of the input gtf file.
Examples
>>> import scmkl
>>>
>>> file = 'data/hg38_subset_protein_coding.annotation.gtf'
>>> gtf = scmkl.read_gtf(file)
>>>
>>> gtf.head()
        chr  source     feature  start    end score strand frame                                          
0  chr1  HAVANA        gene  11869  14409     .      +     .  
1  chr1  HAVANA  transcript  11869  14409     .      +     .  
2  chr1  HAVANA        exon  11869  12227     .      +     .  
3  chr1  HAVANA        exon  12613  12721     .      +     .  
4  chr1  HAVANA        exon  13221  14409     .      +     .  
attribute
gene_id "ENSG00000223972.5"; gene_type "transc...
gene_id "ENSG00000223972.5"; transcript_id "EN...
gene_id "ENSG00000223972.5"; transcript_id "EN...
gene_id "ENSG00000223972.5"; transcript_id "EN...
gene_id "ENSG00000223972.5"; transcript_id "EN...