scmkl.get_gene_groupings

  1import pickle
  2import numpy as np
  3import pandas as pd
  4import gseapy as gp
  5
  6
  7# Prevent iteration of all availible gene sets for human and mouse
  8global_lib_orgs = ['human', 'mouse']
  9
 10human_genesets = [
 11    'Azimuth_2023', 
 12    'Azimuth_Cell_Types_2021', 
 13    'Cancer_Cell_Line_Encyclopedia', 
 14    'CellMarker_2024', 
 15    'CellMarker_Augmented_2021', 
 16    'GO_Biological_Process_2025', 
 17    'GO_Cellular_Component_2025', 
 18    'GO_Molecular_Function_2025', 
 19    'KEGG_2019_Mouse', 
 20    'KEGG_2021_Human', 
 21    'MSigDB_Hallmark_2020', 
 22    'NCI-60_Cancer_Cell_Lines', 
 23    'WikiPathways_2024_Human', 
 24    'WikiPathways_2024_Mouse'
 25]
 26
 27
 28def check_organism(organism: str):
 29    """
 30    Makes sure that organism is in availible organisms from `gseapy`.
 31
 32    Parameters
 33    ----------
 34    organism : str
 35        The organsim to check.
 36
 37    Returns
 38    -------
 39    is_in : bool
 40        `True` if the organism is valid. `False` if organism is not an 
 41        option.
 42    """
 43    org_opts = {'human', 'mouse', 'yeast', 'fly', 'fish', 'worm'}
 44    org_err = f"Invalid `organism`, choose from {org_opts}"
 45    assert organism.lower() in org_opts, org_err
 46
 47    return None
 48
 49
 50def check_groups(groups: list, key_terms: str | list='', 
 51                 blacklist: str | list | bool=False, other_org: str=''):
 52    """
 53    Takes a list of groups from a gene set library and checks the names 
 54    for the desired gene sets. Returns a dictionary with keys 
 55    `'key_terms_in'` that is `list` of `bool`s corresponding to 
 56    `names`. `'num_groups'` value is an int for how many groups are in 
 57    the library.
 58
 59    Parameters
 60    ----------
 61    groups : list
 62        The names of groups for a given library.
 63
 64    key_terms : str | list
 65        The types of cells or other specifiers the gene set is for 
 66        (example: 'CD4 T', 'kidney', ect...).
 67
 68    other_org : str
 69        Either `'human'` or `'mouse'`. The organsim to ignore when 
 70        checking groups. Should be empty if target organism is not 
 71        `'human'` or `'mouse'`.
 72
 73    Returns
 74    -------
 75    result : dict
 76        A dictionary with the following keys and values:
 77
 78        `'name'` : list
 79            The names of each group in input `'groups'` if the name 
 80            does not contain `'other_org'`.
 81    
 82        `'key_terms_in'` : list
 83            A boolean list repective to `'name'` indicating if at 
 84            least one key word from `'key_terms'` is present in the group name.
 85
 86        `'num_groups'` : int
 87            The length of `result['name']`.
 88    """
 89    result = {
 90        'key_terms_in' : list(),
 91        'blacklist_in' : list(),
 92        'name' : list(),
 93    }
 94
 95    for group_name in groups:
 96            
 97        if not other_org in group_name.lower():
 98
 99            if list == type(key_terms):
100                key_terms_in = any([k.lower() in group_name.lower() 
101                                    for k in key_terms])
102            else:
103                key_terms_in = key_terms.lower() in group_name.lower()
104
105            if list == type(blacklist):
106                blacklist_in = any([bl.lower() in group_name.lower() 
107                                    for bl in blacklist])
108            elif str == type(blacklist):
109                blacklist_in = blacklist.lower() in group_name.lower()
110            else:
111                blacklist_in = False
112                
113            result['key_terms_in'].append(key_terms_in)
114            result['blacklist_in'].append(blacklist_in)
115            result['name'].append(group_name)
116
117    result['num_groups'] = len(result['name'])
118
119    return result
120
121
def check_libs(libs, key_terms: str | list='', 
               blacklist: str | list | bool=False, other_org: str=''):
    """
    Summarizes, per library, how many group names were kept and how 
    many of them match `key_terms` and `blacklist`.

    Parameters
    ----------
    libs : dict
        A dictionary as `libs[library_name] = library_groups`, where 
        `library_groups` is itself a `dict` keyed by group name.

    key_terms : str | list
        A `str` or `list` of `str`s to search for in `libs` group 
        names. Passed on to `check_groups`.

    blacklist : str | list | bool
        Term(s) undesired in group names. Passed on to 
        `check_groups`.

    other_org : str
        Only applicable when desired organism is `'human'` or 
        `'mouse'`. If desired organism is `'human'`, `other_org` 
        should be `'mouse'` and vice-versa. Passed on to 
        `check_groups`.

    Returns
    -------
    summary, tally : pd.DataFrame, pd.DataFrame
        `summary` has one row per library that has at least one row 
        in `tally`, ordered by `np.unique`, with cols `'Library'` and 
        `'No. Gene Sets'`. `'No. Key Terms Matching'` is only 
        included if `key_terms` is truthy and 
        `'No. Blacklist Matching'` only if `blacklist` is truthy. 
        `tally` has one row per kept group with cols `['library', 
        'key_terms_in', 'blacklist_in', 'name']`.
    """
    rows = {
        'library' : [], 
        'key_terms_in' : [], 
        'blacklist_in' : [],
        'name' : []
    }
    group_totals = {}

    # One `check_groups` pass per library, stacked into long format
    for lib_name, lib_groups in libs.items():
        checked = check_groups(list(lib_groups.keys()), key_terms, 
                               blacklist, other_org)

        rows['library'] += [lib_name] * len(checked['name'])
        for column in ('key_terms_in', 'blacklist_in', 'name'):
            rows[column] += checked[column]

        group_totals[lib_name] = checked['num_groups']

    tally = pd.DataFrame(rows)

    # Number of flagged groups per library
    per_library = tally.groupby('library')
    key_hits = per_library['key_terms_in'].sum().to_dict()
    bl_hits = per_library['blacklist_in'].sum().to_dict()

    lib_names = np.unique(tally['library'])
    key_counts = [key_hits[name] for name in lib_names]
    bl_counts = [bl_hits[name] for name in lib_names]

    columns = {
        'Library' : lib_names,
        'No. Gene Sets' : [group_totals[name] for name in lib_names]
    }

    if key_terms:
        columns['No. Key Terms Matching'] = key_counts
    if blacklist:
        columns['No. Blacklist Matching'] = bl_counts

    summary = pd.DataFrame(columns)

    return summary, tally
200
201
def find_candidates(organism: str='human', key_terms: str | list='', blacklist: str | list | bool=False):
    """
    Given `organism` and `key_terms`, will search for gene 
    groupings that could fit the datasets/classification task. 
    `blacklist` terms undesired in group names.

    Parameters
    ----------
    organism : str
        The species the gene grouping is for. Options are 
        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}` 
        (case-insensitive).

    key_terms : str | list
        The types of cells or other specifiers the gene set is for 
        (example: 'CD4 T').

    blacklist : str | list | bool
        Term(s) undesired in group names. Ignored unless provided.

    Returns
    -------
    libs_df : pd.DataFrame
        The `summary` table from `check_libs`: one row per gene set 
        library with the number of gene sets and, when `key_terms` / 
        `blacklist` are given, the number of group names matching 
        them.

    Raises
    ------
    AssertionError
        If `organism` is rejected by `check_organism`.

    Examples
    --------
    >>> scmkl.find_candidates('human', key_terms=' b ')
                                Library  No. Gene Sets
    0                    Azimuth_2023           1241
    1         Azimuth_Cell_Types_2021            341
    2   Cancer_Cell_Line_Encyclopedia            967
    3                 CellMarker_2024           1134
    No. Key Terms Matching
    9
    9
    0
    21
    """
    check_organism(organism)

    # `global_lib_orgs` is lower case; compare with a normalized copy so 
    # that e.g. 'Human' no longer fails when picking the other organism
    org_key = organism.lower()

    if org_key in global_lib_orgs:
        other_org = next(org for org in global_lib_orgs if org != org_key)
        # Curated list, minus libraries named after the other organism
        lib_names = [name for name in human_genesets 
                     if other_org not in name.lower()]
    else:
        lib_names = gp.get_library_name(organism)
        other_org = ''

    libs = {name : gp.get_library(name, organism)
            for name in lib_names}
    
    libs_df, _ = check_libs(libs, key_terms, blacklist, other_org)

    return libs_df
259
260
def get_gene_groupings(lib_name: str, organism: str='human', key_terms: str | list='', 
                       blacklist: str | list | bool=False, min_overlap: int=2,
                      genes: list | tuple | pd.Series | np.ndarray | set=[]):
    """
    Takes a gene set library name and filters to groups containing 
    element(s) in `key_terms` and none of the element(s) in 
    `blacklist`. If `genes` is provided, will ensure that there are 
    at least `min_overlap` number of genes in each group. Resulting 
    groups meet all of the before-mentioned criteria.

    Parameters
    ----------
    lib_name : str
        The desired library name.

    organism : str
        The species the gene grouping is for. Options are 
        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}` 
        (case-insensitive).

    key_terms : str | list
        The types of cells or other specifiers the gene set is for 
        (example: 'CD4 T'). If empty, groups are not filtered by 
        key terms.

    blacklist : str | list | bool
        Term(s) undesired in group names. Ignored unless provided.

    min_overlap : int
        The minimum number of genes that must be present in a group 
        for it to be kept. If `genes` is not given, ignored.

    genes : array_like
        A vector of genes from the reference/query datasets. If not 
        assigned, function will not filter groups based on feature 
        overlap. The input is never modified.

    Returns
    -------
    lib : dict
        The filtered library as a `dict` where keys are group names 
        and values are features.

    Raises
    ------
    AssertionError
        If `organism` is rejected by `check_organism`.

    Examples
    --------
    >>> dataset_feats = [
    ...    'FUCA1', 'CLIC4', 'STMN1', 'SYF2', 'TAS1R1', 
    ...    'NOL9', 'TAS1R3', 'SLC2A5', 'THAP3', 'IGHM', 
    ...    'MARCKS', 'BANK1', 'TNFRSF13B', 'IGKC', 'IGHD', 
    ...    'LINC01857', 'CD24', 'CD37', 'IGHD', 'RALGPS2'
    ...    ]
    >>> rna_grouping = scmkl.get_gene_groupings(
    ...   'Azimuth_2023', key_terms=[' b ', 'b cell', 'b '], 
    ...   genes=dataset_feats)
    >>>
    >>> rna_grouping.keys()
    dict_keys(['PBMC-L1-B Cell', 'PBMC-L2-Intermediate B Cell', ...])
    """
    check_organism(organism)
    
    lib = gp.get_library(lib_name, organism)

    # `global_lib_orgs` is lower case; compare with a normalized copy so 
    # that e.g. 'Human' no longer fails when picking the other organism
    org_key = organism.lower()
    if org_key in global_lib_orgs:
        other_org = next(org for org in global_lib_orgs if org != org_key)
    else:
        other_org = ''

    res = check_groups(list(lib.keys()), key_terms, blacklist, other_org)

    # One keep/drop flag per name returned by `check_groups`
    keep = [True] * len(res['name'])

    if key_terms:
        keep = list(res['key_terms_in'])
    else:
        print("Not filtering with `key_terms` parameter.")

    if blacklist:
        keep = [kept and not blocked 
                for kept, blocked in zip(keep, res['blacklist_in'])]
    else:
        print("Not filtering with `blacklist` parameter.")
    
    # Filtering library
    lib = {name : lib[name] 
           for name, kept in zip(res['name'], keep) if kept}

    if len(genes) > 0:
        # `set()` accepts every supported container (tuples included) 
        # and leaves the caller's object untouched
        gene_pool = list(set(genes))

        # Collected first because `lib` cannot shrink while iterated
        del_groups = [group for group, features in lib.items()
                      if np.sum(np.isin(features, gene_pool)) < min_overlap]

        # Removing groups without enough overlap
        for group in del_groups:
            print(f'Removing {group} from grouping.')
            del lib[group]

    else:
        print("Not checking overlap between group and dataset features.")

    return lib
global_lib_orgs = ['human', 'mouse']
human_genesets = ['Azimuth_2023', 'Azimuth_Cell_Types_2021', 'Cancer_Cell_Line_Encyclopedia', 'CellMarker_2024', 'CellMarker_Augmented_2021', 'GO_Biological_Process_2025', 'GO_Cellular_Component_2025', 'GO_Molecular_Function_2025', 'KEGG_2019_Mouse', 'KEGG_2021_Human', 'MSigDB_Hallmark_2020', 'NCI-60_Cancer_Cell_Lines', 'WikiPathways_2024_Human', 'WikiPathways_2024_Mouse']
def check_organism(organism: str):
29def check_organism(organism: str):
30    """
31    Makes sure that organism is in availible organisms from `gseapy`.
32
33    Parameters
34    ----------
35    organism : str
36        The organsim to check.
37
38    Returns
39    -------
40    is_in : bool
41        `True` if the organism is valid. `False` if organism is not an 
42        option.
43    """
44    org_opts = {'human', 'mouse', 'yeast', 'fly', 'fish', 'worm'}
45    org_err = f"Invalid `organism`, choose from {org_opts}"
46    assert organism.lower() in org_opts, org_err
47
48    return None

Makes sure that organism is one of the organisms available from gseapy.

Parameters
  • organism (str): The organism to check.
Returns
  • None: Nothing is returned. An AssertionError is raised if organism is not a valid option.
def check_groups( groups: list, key_terms: str | list = '', blacklist: str | list | bool = False, other_org: str = ''):
 51def check_groups(groups: list, key_terms: str | list='', 
 52                 blacklist: str | list | bool=False, other_org: str=''):
 53    """
 54    Takes a list of groups from a gene set library and checks the names 
 55    for the desired gene sets. Returns a dictionary with keys 
 56    `'key_terms_in'` that is `list` of `bool`s corresponding to 
 57    `names`. `'num_groups'` value is an int for how many groups are in 
 58    the library.
 59
 60    Parameters
 61    ----------
 62    groups : list
 63        The names of groups for a given library.
 64
 65    key_terms : str | list
 66        The types of cells or other specifiers the gene set is for 
 67        (example: 'CD4 T', 'kidney', ect...).
 68
 69    other_org : str
 70        Either `'human'` or `'mouse'`. The organsim to ignore when 
 71        checking groups. Should be empty if target organism is not 
 72        `'human'` or `'mouse'`.
 73
 74    Returns
 75    -------
 76    result : dict
 77        A dictionary with the following keys and values:
 78
 79        `'name'` : list
 80            The names of each group in input `'groups'` if the name 
 81            does not contain `'other_org'`.
 82    
 83        `'key_terms_in'` : list
 84            A boolean list repective to `'name'` indicating if at 
 85            least one key word from `'key_terms'` is present in the group name.
 86
 87        `'num_groups'` : int
 88            The length of `result['name']`.
 89    """
 90    result = {
 91        'key_terms_in' : list(),
 92        'blacklist_in' : list(),
 93        'name' : list(),
 94    }
 95
 96    for group_name in groups:
 97            
 98        if not other_org in group_name.lower():
 99
100            if list == type(key_terms):
101                key_terms_in = any([k.lower() in group_name.lower() 
102                                    for k in key_terms])
103            else:
104                key_terms_in = key_terms.lower() in group_name.lower()
105
106            if list == type(blacklist):
107                blacklist_in = any([bl.lower() in group_name.lower() 
108                                    for bl in blacklist])
109            elif str == type(blacklist):
110                blacklist_in = blacklist.lower() in group_name.lower()
111            else:
112                blacklist_in = False
113                
114            result['key_terms_in'].append(key_terms_in)
115            result['blacklist_in'].append(blacklist_in)
116            result['name'].append(group_name)
117
118    result['num_groups'] = len(result['name'])
119
120    return result

Takes a list of groups from a gene set library and checks the names for the desired gene sets. Returns a dictionary whose 'key_terms_in' entry is a list of bools corresponding to 'name', and whose 'num_groups' entry is an int giving how many groups were kept from the library.

Parameters
  • groups (list): The names of groups for a given library.
  • key_terms (str | list): The types of cells or other specifiers the gene set is for (example: 'CD4 T', 'kidney', etc.).
  • other_org (str): Either 'human' or 'mouse'. The organism to ignore when checking groups. Should be empty if target organism is not 'human' or 'mouse'.
Returns
  • result (dict): A dictionary with the following keys and values:

    'name' : list The names of each group in input 'groups' if the name does not contain 'other_org'.

    'key_terms_in' : list A boolean list respective to 'name' indicating if at least one key word from 'key_terms' is present in the group name.

    'num_groups' : int The length of result['name'].

def check_libs( libs, key_terms: str | list = '', blacklist: str | list | bool = False, other_org: str = ''):
123def check_libs(libs, key_terms: str | list='', 
124               blacklist: str | list | bool=False, other_org: str=''):
125    """
126    Checks libraries for desired `key_terms` in groups.
127
128    Parameters
129    ----------
130    libs : dict
131        A dictionary as `libs[library_name] = library_groups`.
132
133    key_terms : str | list
134        A `str` or `list` of `str`s to seach for in `libs` group names.
135
136    other_org : str
137        Only applicable when desired organism is `'human'` or 
138        `'mouse'`. If desired organism is `'human'`, `other_org` 
139        should be `'mouse'` and vice-versa.
140
141    Returns
142    -------
143    summary, tally : pd.DataFrame | pd.DataFrame
144        `summary` has cols `['Library', 'No. Gene Sets', 
145        'No. Key Terms Matching']` where `'Library'` is the library from 
146        `gseapy` with `'No. Gene Sets'` and `'No. Key Terms Matching'` 
147        corresponding. `'No. Key Terms Matching'` only included if 
148        `key_terms` argument is provided. `tally` has cols `['library', 
149        'key_terms_in', 'name']` 
150    """
151    num_groups = dict()
152    
153    tally = {
154        'library' : list(), 
155        'key_terms_in' : list(), 
156        'blacklist_in' : list(),
157        'name' : list()
158    }
159    
160    for library, groups in libs.items():
161        res = check_groups(list(groups.keys()), key_terms, 
162                           blacklist, other_org)
163
164        lib_repeats = [library]*len(res['name'])
165
166        tally['library'].extend(lib_repeats)
167        tally['key_terms_in'].extend(res['key_terms_in'])
168        tally['blacklist_in'].extend(res['blacklist_in'])
169        tally['name'].extend(res['name'])
170
171        num_groups[library] = res['num_groups']
172
173    tally = pd.DataFrame(tally)
174
175    key_dict = tally.copy()
176    key_dict = key_dict.groupby('library')['key_terms_in'].sum().reset_index()
177    key_dict = dict(zip(key_dict['library'], key_dict['key_terms_in']))
178
179    bl_dict = tally.copy()
180    bl_dict = tally.groupby('library')['blacklist_in'].sum().reset_index()
181    bl_dict = dict(zip(bl_dict['library'], bl_dict['blacklist_in']))
182
183    lib_names = np.unique(tally['library'])
184    key_counts = [key_dict[l] for l in lib_names]
185    bl_counts = [bl_dict[l] for l in lib_names]
186    n_groups = [num_groups[l] for l in lib_names]
187
188    summary = {
189        'Library' : lib_names,
190        'No. Gene Sets' : n_groups
191    }
192
193    if key_terms:
194        summary['No. Key Terms Matching'] = key_counts
195    if blacklist:
196        summary['No. Blacklist Matching'] = bl_counts
197
198    summary = pd.DataFrame(summary)
199
200    return summary, tally

Checks libraries for desired key_terms in groups.

Parameters
  • libs (dict): A dictionary as libs[library_name] = library_groups.
  • key_terms (str | list): A str or list of strs to search for in libs group names.
  • other_org (str): Only applicable when desired organism is 'human' or 'mouse'. If desired organism is 'human', other_org should be 'mouse' and vice-versa.
Returns
  • summary, tally (pd.DataFrame | pd.DataFrame): summary has cols ['Library', 'No. Gene Sets', 'No. Key Terms Matching'] where 'Library' is the library from gseapy with 'No. Gene Sets' and 'No. Key Terms Matching' corresponding. 'No. Key Terms Matching' only included if key_terms argument is provided. tally has cols ['library', 'key_terms_in', 'name']
def find_candidates( organism: str = 'human', key_terms: str | list = '', blacklist: str | list | bool = False):
203def find_candidates(organism: str='human', key_terms: str | list='', blacklist: str | list | bool=False):
204    """
205    Given `organism` and `key_terms`, will search for gene 
206    groupings that could fit the datasets/classification task. 
207    `blacklist` terms undesired in group names.
208
209    Parameters
210    ----------
211    organism : str
212        The species the gene grouping is for. Options are 
213        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}`
214
215    key_terms : str | list
216        The types of cells or other specifiers the gene set is for 
217        (example: 'CD4 T').
218
219    blacklist : str | list | bool
220        Term(s) undesired in group names. Ignored unless provided.
221
222    Returns
223    -------
224    libraries : list
225        A list of gene set library names that could serve for the 
226        dataset/classification task.        
227
228    Examples
229    --------
230    >>> scmkl.find_candidates('human', key_terms=' b ')
231                                Library  No. Gene Sets
232    0                    Azimuth_2023           1241
233    1         Azimuth_Cell_Types_2021            341
234    2   Cancer_Cell_Line_Encyclopedia            967
235    3                 CellMarker_2024           1134
236    No. Key Type Matching
237    9
238    9
239    0
240    21
241    """
242    check_organism(organism)
243
244    if organism.lower() in global_lib_orgs:
245        glo = global_lib_orgs.copy()
246        glo.remove(organism)
247        other_org = glo[0]
248        libs = human_genesets
249        libs = [name for name in libs if not other_org in name.lower()]
250    else:
251        libs = gp.get_library_name(organism)
252        other_org = ''
253
254    libs = {name : gp.get_library(name, organism)
255            for name in libs}
256    
257    libs_df, _ = check_libs(libs, key_terms, blacklist, other_org)
258
259    return libs_df

Given organism and key_terms, will search for gene groupings that could fit the datasets/classification task. blacklist terms undesired in group names.

Parameters
  • organism (str): The species the gene grouping is for. Options are {'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}
  • key_terms (str | list): The types of cells or other specifiers the gene set is for (example: 'CD4 T').
  • blacklist (str | list | bool): Term(s) undesired in group names. Ignored unless provided.
Returns
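  • libs_df (pd.DataFrame): A summary table with one row per gene set library that could serve for the dataset/classification task, giving the number of gene sets and, when key_terms or blacklist are provided, the number of group names matching them.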
Examples
>>> scmkl.find_candidates('human', key_terms=' b ')
                            Library  No. Gene Sets
0                    Azimuth_2023           1241
1         Azimuth_Cell_Types_2021            341
2   Cancer_Cell_Line_Encyclopedia            967
3                 CellMarker_2024           1134
No. Key Type Matching
9
9
0
21
def get_gene_groupings( lib_name: str, organism: str = 'human', key_terms: str | list = '', blacklist: str | list | bool = False, min_overlap: int = 2, genes: list | tuple | pandas.core.series.Series | numpy.ndarray | set = []):
262def get_gene_groupings(lib_name: str, organism: str='human', key_terms: str | list='', 
263                       blacklist: str | list | bool=False, min_overlap: int=2,
264                      genes: list | tuple | pd.Series | np.ndarray | set=[]):
265    """
266    Takes a gene set library name and filters to groups containing 
267    element(s) in `key_terms`. If genes is provided, will 
268    ensure that there are at least `min_overlap` number of genes in 
269    each group. Resulting groups will meet all of the before-mentioned 
270    criteria if `isin_logic` is `'and'` | `'or'`.
271
272    Parameters
273    ----------
274    lib_name : str
275        The desired library name.
276
277    organism : str
278        The species the gene grouping is for. Options are 
279        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}`.
280
281    key_terms : str | list
282        The types of cells or other specifiers the gene set is for 
283        (example: 'CD4 T').
284
285    genes : array_like
286        A vector of genes from the reference/query datasets. If not 
287        assigned, function will not filter groups based on feature 
288        overlap.
289
290    min_overlap : int
291        The minimum number of genes that must be present in a group 
292        for it to be kept. If `genes` is not given, ignored.
293
294    Returns
295    -------
296    lib : dict
297        The filtered library as a `dict` where keys are group names 
298        and keys are features.
299
300    Examples
301    --------
302    >>> dataset_feats = [
303    ...    'FUCA1', 'CLIC4', 'STMN1', 'SYF2', 'TAS1R1', 
304    ...    'NOL9', 'TAS1R3', 'SLC2A5', 'THAP3', 'IGHM', 
305    ...    'MARCKS', 'BANK1', 'TNFRSF13B', 'IGKC', 'IGHD', 
306    ...    'LINC01857', 'CD24', 'CD37', 'IGHD', 'RALGPS2'
307    ...    ]
308    >>> rna_grouping = scmkl.get_gene_groupings(
309    ...   'Azimuth_2023', key_terms=[' b ', 'b cell', 'b '], 
310    ...   genes=dataset_feats)
311    >>>
312    >>> rna_groupings.keys()
313    dict_keys(['PBMC-L1-B Cell', 'PBMC-L2-Intermediate B Cell', ...])
314    """
315    check_organism(organism)
316    
317    lib = gp.get_library(lib_name, organism)
318
319    if organism.lower() in global_lib_orgs:
320        glo = global_lib_orgs.copy()
321        glo.remove(organism)
322        other_org = glo[0]
323    else:
324        other_org = ''
325
326    group_names = list(lib.keys())
327    res = check_groups(group_names, key_terms, blacklist, other_org)
328    del res['num_groups']
329
330    # Finding groups where group name matches key_terms
331    g_summary = pd.DataFrame(res)
332
333    if key_terms:
334        kept = g_summary['key_terms_in']
335        kept_groups = g_summary['name'][kept].to_numpy()
336        g_summary = g_summary[kept]
337    else:
338        print("Not filtering with `key_terms` parameter.")
339        kept_groups = g_summary['name'].to_numpy()
340
341    if blacklist:
342        kept = ~g_summary['blacklist_in']
343        kept_groups = g_summary['name'][kept].to_numpy()
344    else:
345        print("Not filtering with `blacklist` parameter.")
346    
347    # Filtering library
348    lib = {group : lib[group] for group in kept_groups}
349
350    if 0 < len(genes):
351        del_groups = list()
352        genes = list(set(genes.copy()))
353        for group, features in lib.items():
354            overlap = np.isin(features, genes)
355            overlap = np.sum(overlap)
356            if overlap < min_overlap:
357                print(overlap, flush=True)
358                del_groups.append(group)
359
360        # Removing genes without enough overlap
361        for group in del_groups:
362            print(f'Removing {group} from grouping.')
363            del lib[group]
364
365    else:
366        print("Not checking overlap between group and dataset features.")
367
368    return lib

Takes a gene set library name and filters to groups containing element(s) in key_terms. If genes is provided, will ensure that there are at least min_overlap number of genes in each group. Resulting groups will meet all of the before-mentioned criteria.

Parameters
  • lib_name (str): The desired library name.
  • organism (str): The species the gene grouping is for. Options are {'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}.
  • key_terms (str | list): The types of cells or other specifiers the gene set is for (example: 'CD4 T').
  • genes (array_like): A vector of genes from the reference/query datasets. If not assigned, function will not filter groups based on feature overlap.
  • min_overlap (int): The minimum number of genes that must be present in a group for it to be kept. If genes is not given, ignored.
Returns
  • lib (dict): The filtered library as a dict where keys are group names and values are features.
Examples
>>> dataset_feats = [
...    'FUCA1', 'CLIC4', 'STMN1', 'SYF2', 'TAS1R1', 
...    'NOL9', 'TAS1R3', 'SLC2A5', 'THAP3', 'IGHM', 
...    'MARCKS', 'BANK1', 'TNFRSF13B', 'IGKC', 'IGHD', 
...    'LINC01857', 'CD24', 'CD37', 'IGHD', 'RALGPS2'
...    ]
>>> rna_grouping = scmkl.get_gene_groupings(
...   'Azimuth_2023', key_terms=[' b ', 'b cell', 'b '], 
...   genes=dataset_feats)
>>>
>>> rna_grouping.keys()
dict_keys(['PBMC-L1-B Cell', 'PBMC-L2-Intermediate B Cell', ...])