scmkl.get_gene_groupings
import pickle
import numpy as np
import pandas as pd
import gseapy as gp


# Prevent iteration of all available gene sets for human and mouse
global_lib_orgs = ['human', 'mouse']

# Curated libraries searched when the organism is human or mouse.
# Libraries named after the other organism are filtered out at call time.
human_genesets = [
    'Azimuth_2023',
    'Azimuth_Cell_Types_2021',
    'Cancer_Cell_Line_Encyclopedia',
    'CellMarker_2024',
    'CellMarker_Augmented_2021',
    'GO_Biological_Process_2025',
    'GO_Cellular_Component_2025',
    'GO_Molecular_Function_2025',
    'KEGG_2019_Mouse',
    'KEGG_2021_Human',
    'MSigDB_Hallmark_2020',
    'NCI-60_Cancer_Cell_Lines',
    'WikiPathways_2024_Human',
    'WikiPathways_2024_Mouse'
]


def check_organism(organism: str):
    """
    Makes sure that organism is in available organisms from `gseapy`.

    Parameters
    ----------
    organism : str
        The organism to check. Matching is case-insensitive.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If `organism` is not one of the supported organisms.
    """
    org_opts = {'human', 'mouse', 'yeast', 'fly', 'fish', 'worm'}

    # Raised explicitly instead of using an `assert` statement so the
    # validation is not stripped when Python runs with `-O`.
    if organism.lower() not in org_opts:
        raise AssertionError(
            f"Invalid `organism`, choose from {sorted(org_opts)}"
        )

    return None


def _normalize_terms(terms) -> list:
    """Return `terms` as a list of lower-cased search strings."""
    if isinstance(terms, str):
        return [terms.lower()]
    if terms is None or isinstance(terms, bool):
        # `False`/`True`/`None` mean "no terms were provided".
        return []
    return [term.lower() for term in terms]


def check_groups(groups: list, key_terms: str | list='',
                 blacklist: str | list | bool=False, other_org: str=''):
    """
    Takes a list of groups from a gene set library and checks the names
    for the desired gene sets.

    Parameters
    ----------
    groups : list
        The names of groups for a given library.

    key_terms : str | list
        The types of cells or other specifiers the gene set is for
        (example: 'CD4 T', 'kidney', etc.). Matching is a
        case-insensitive substring search.

    blacklist : str | list | bool
        Term(s) undesired in group names. A `bool` means no terms
        were provided.

    other_org : str
        Either `'human'` or `'mouse'`. The organism to ignore when
        checking groups. Should be empty if target organism is not
        `'human'` or `'mouse'`.

    Returns
    -------
    result : dict
        A dictionary with the following keys and values:

        `'name'` : list
            The names of each group in input `'groups'` if the name
            does not contain `'other_org'`.

        `'key_terms_in'` : list
            A boolean list respective to `'name'` indicating if at
            least one key word from `'key_terms'` is present in the
            group name.

        `'blacklist_in'` : list
            A boolean list respective to `'name'` indicating if at
            least one term from `'blacklist'` is present in the
            group name.

        `'num_groups'` : int
            The length of `result['name']`.
    """
    key_list = _normalize_terms(key_terms)
    bl_list = _normalize_terms(blacklist)
    skip_org = other_org.lower()

    result = {
        'key_terms_in' : list(),
        'blacklist_in' : list(),
        'name' : list(),
    }

    for group_name in groups:
        lowered = group_name.lower()

        # An empty `other_org` must not filter anything: `'' in s` is
        # always True, so the emptiness check has to come first.
        if skip_org and skip_org in lowered:
            continue

        result['key_terms_in'].append(any(k in lowered for k in key_list))
        result['blacklist_in'].append(any(b in lowered for b in bl_list))
        result['name'].append(group_name)

    result['num_groups'] = len(result['name'])

    return result


def check_libs(libs, key_terms: str | list='',
               blacklist: str | list | bool=False, other_org: str=''):
    """
    Checks libraries for desired `key_terms` in groups.

    Parameters
    ----------
    libs : dict
        A dictionary as `libs[library_name] = library_groups`.

    key_terms : str | list
        A `str` or `list` of `str`s to search for in `libs` group
        names.

    blacklist : str | list | bool
        Term(s) undesired in group names.

    other_org : str
        Only applicable when desired organism is `'human'` or
        `'mouse'`. If desired organism is `'human'`, `other_org`
        should be `'mouse'` and vice-versa.

    Returns
    -------
    summary, tally : pd.DataFrame | pd.DataFrame
        `summary` has cols `['Library', 'No. Gene Sets']` plus
        `'No. Key Terms Matching'` if `key_terms` is provided and
        `'No. Blacklist Matching'` if `blacklist` is provided. Rows
        are sorted by library name and only libraries with at least
        one retained group are listed. `tally` has cols `['library',
        'key_terms_in', 'blacklist_in', 'name']` with one row per
        retained group.
    """
    tally = {
        'library' : list(),
        'key_terms_in' : list(),
        'blacklist_in' : list(),
        'name' : list()
    }

    # counts[library] = (n groups, n key term hits, n blacklist hits)
    counts = dict()

    for library, groups in libs.items():
        res = check_groups(list(groups), key_terms, blacklist, other_org)
        n_kept = res['num_groups']

        tally['library'].extend([library] * n_kept)
        tally['key_terms_in'].extend(res['key_terms_in'])
        tally['blacklist_in'].extend(res['blacklist_in'])
        tally['name'].extend(res['name'])

        if n_kept:
            counts[library] = (n_kept,
                               sum(res['key_terms_in']),
                               sum(res['blacklist_in']))

    tally = pd.DataFrame(tally)

    lib_names = sorted(counts)

    summary = {
        'Library' : lib_names,
        'No. Gene Sets' : [counts[name][0] for name in lib_names]
    }

    if key_terms:
        summary['No. Key Terms Matching'] = [counts[name][1]
                                             for name in lib_names]
    if blacklist:
        summary['No. Blacklist Matching'] = [counts[name][2]
                                             for name in lib_names]

    summary = pd.DataFrame(summary)

    return summary, tally


def find_candidates(organism: str='human', key_terms: str | list='',
                    blacklist: str | list | bool=False):
    """
    Given `organism` and `key_terms`, will search for gene
    groupings that could fit the datasets/classification task.
    `blacklist` specifies terms undesired in group names.

    Parameters
    ----------
    organism : str
        The species the gene grouping is for. Options are
        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}`
        (case-insensitive).

    key_terms : str | list
        The types of cells or other specifiers the gene set is for
        (example: 'CD4 T').

    blacklist : str | list | bool
        Term(s) undesired in group names. Ignored unless provided.

    Returns
    -------
    libraries : pd.DataFrame
        The summary table from `check_libs`, one row per library.

    Examples
    --------
    >>> scmkl.find_candidates('human', key_terms=' b ')
                            Library  No. Gene Sets
    0                    Azimuth_2023           1241
    1         Azimuth_Cell_Types_2021            341
    2   Cancer_Cell_Line_Encyclopedia            967
    3                 CellMarker_2024           1134
    No. Key Terms Matching
                         9
                         9
                         0
                        21
    """
    check_organism(organism)

    # Compare on the lower-cased name so 'Human' and 'human' behave
    # the same way.
    org = organism.lower()

    if org in global_lib_orgs:
        other_org = next(o for o in global_lib_orgs if o != org)
        lib_names = [name for name in human_genesets
                     if other_org not in name.lower()]
    else:
        lib_names = gp.get_library_name(organism)
        other_org = ''

    libs = {name : gp.get_library(name, organism)
            for name in lib_names}

    libs_df, _ = check_libs(libs, key_terms, blacklist, other_org)

    return libs_df


def get_gene_groupings(lib_name: str, organism: str='human', key_terms: str | list='',
                       blacklist: str | list | bool=False, min_overlap: int=2,
                       genes: list | tuple | pd.Series | np.ndarray | set=()):
    """
    Takes a gene set library name and filters to groups containing
    element(s) in `key_terms` and none of the terms in `blacklist`.
    If genes is provided, will ensure that there are at least
    `min_overlap` number of genes in each group.

    Parameters
    ----------
    lib_name : str
        The desired library name.

    organism : str
        The species the gene grouping is for. Options are
        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}`
        (case-insensitive).

    key_terms : str | list
        The types of cells or other specifiers the gene set is for
        (example: 'CD4 T').

    blacklist : str | list | bool
        Term(s) undesired in group names. Ignored unless provided.

    min_overlap : int
        The minimum number of genes that must be present in a group
        for it to be kept. If `genes` is not given, ignored.

    genes : array_like
        A vector of genes from the reference/query datasets. If not
        assigned, function will not filter groups based on feature
        overlap.

    Returns
    -------
    lib : dict
        The filtered library as a `dict` where keys are group names
        and values are features.

    Examples
    --------
    >>> dataset_feats = [
    ...     'FUCA1', 'CLIC4', 'STMN1', 'SYF2', 'TAS1R1',
    ...     'NOL9', 'TAS1R3', 'SLC2A5', 'THAP3', 'IGHM',
    ...     'MARCKS', 'BANK1', 'TNFRSF13B', 'IGKC', 'IGHD',
    ...     'LINC01857', 'CD24', 'CD37', 'IGHD', 'RALGPS2'
    ... ]
    >>> rna_grouping = scmkl.get_gene_groupings(
    ...     'Azimuth_2023', key_terms=[' b ', 'b cell', 'b '],
    ...     genes=dataset_feats)
    >>>
    >>> rna_grouping.keys()
    dict_keys(['PBMC-L1-B Cell', 'PBMC-L2-Intermediate B Cell', ...])
    """
    check_organism(organism)

    lib = gp.get_library(lib_name, organism)

    # Compare on the lower-cased name so 'Human' and 'human' behave
    # the same way.
    org = organism.lower()
    if org in global_lib_orgs:
        other_org = next(o for o in global_lib_orgs if o != org)
    else:
        other_org = ''

    res = check_groups(list(lib), key_terms, blacklist, other_org)

    use_key_terms = bool(key_terms)
    use_blacklist = bool(blacklist)

    if not use_key_terms:
        print("Not filtering with `key_terms` parameter.")
    if not use_blacklist:
        print("Not filtering with `blacklist` parameter.")

    # Keep groups that match a key term (when given) and do not match
    # a blacklist term (when given); library order is preserved.
    kept_groups = [
        name
        for name, has_key, has_bl in zip(res['name'],
                                         res['key_terms_in'],
                                         res['blacklist_in'])
        if (has_key or not use_key_terms)
        and not (use_blacklist and has_bl)
    ]

    # Filtering library
    lib = {group : lib[group] for group in kept_groups}

    if 0 < len(genes):
        # `set()` accepts every advertised input type, including
        # tuples, which have no `.copy()` method.
        gene_set = set(genes)

        del_groups = [
            group for group, features in lib.items()
            if sum(feat in gene_set for feat in features) < min_overlap
        ]

        # Removing groups without enough overlap
        for group in del_groups:
            print(f'Removing {group} from grouping.')
            del lib[group]

    else:
        print("Not checking overlap between group and dataset features.")

    return lib
def check_organism(organism: str):
    """
    Makes sure that organism is in available organisms from `gseapy`.

    Parameters
    ----------
    organism : str
        The organism to check. Matching is case-insensitive.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If `organism` is not one of the supported organisms.
    """
    org_opts = {'human', 'mouse', 'yeast', 'fly', 'fish', 'worm'}

    # Raised explicitly instead of using an `assert` statement so the
    # validation is not stripped when Python runs with `-O`.
    if organism.lower() not in org_opts:
        raise AssertionError(
            f"Invalid `organism`, choose from {sorted(org_opts)}"
        )

    return None
Makes sure that organism is one of the organisms available from gseapy.
Parameters
- organism (str): The organism to check.
Returns
- None:
Nothing is returned. An AssertionError is raised if the organism is not a valid option.
51def check_groups(groups: list, key_terms: str | list='', 52 blacklist: str | list | bool=False, other_org: str=''): 53 """ 54 Takes a list of groups from a gene set library and checks the names 55 for the desired gene sets. Returns a dictionary with keys 56 `'key_terms_in'` that is `list` of `bool`s corresponding to 57 `names`. `'num_groups'` value is an int for how many groups are in 58 the library. 59 60 Parameters 61 ---------- 62 groups : list 63 The names of groups for a given library. 64 65 key_terms : str | list 66 The types of cells or other specifiers the gene set is for 67 (example: 'CD4 T', 'kidney', ect...). 68 69 other_org : str 70 Either `'human'` or `'mouse'`. The organsim to ignore when 71 checking groups. Should be empty if target organism is not 72 `'human'` or `'mouse'`. 73 74 Returns 75 ------- 76 result : dict 77 A dictionary with the following keys and values: 78 79 `'name'` : list 80 The names of each group in input `'groups'` if the name 81 does not contain `'other_org'`. 82 83 `'key_terms_in'` : list 84 A boolean list repective to `'name'` indicating if at 85 least one key word from `'key_terms'` is present in the group name. 86 87 `'num_groups'` : int 88 The length of `result['name']`. 
89 """ 90 result = { 91 'key_terms_in' : list(), 92 'blacklist_in' : list(), 93 'name' : list(), 94 } 95 96 for group_name in groups: 97 98 if not other_org in group_name.lower(): 99 100 if list == type(key_terms): 101 key_terms_in = any([k.lower() in group_name.lower() 102 for k in key_terms]) 103 else: 104 key_terms_in = key_terms.lower() in group_name.lower() 105 106 if list == type(blacklist): 107 blacklist_in = any([bl.lower() in group_name.lower() 108 for bl in blacklist]) 109 elif str == type(blacklist): 110 blacklist_in = blacklist.lower() in group_name.lower() 111 else: 112 blacklist_in = False 113 114 result['key_terms_in'].append(key_terms_in) 115 result['blacklist_in'].append(blacklist_in) 116 result['name'].append(group_name) 117 118 result['num_groups'] = len(result['name']) 119 120 return result
Takes a list of groups from a gene set library and checks the names
for the desired gene sets. Returns a dictionary with keys
'key_terms_in' that is list of bools corresponding to
names. 'num_groups' value is an int for how many groups are in
the library.
Parameters
- groups (list): The names of groups for a given library.
- key_terms (str | list): The types of cells or other specifiers the gene set is for (example: 'CD4 T', 'kidney', etc.).
- other_org (str):
Either 'human' or 'mouse'. The organism to ignore when checking groups. Should be empty if the target organism is not 'human' or 'mouse'.
Returns
result (dict): A dictionary with the following keys and values:
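- 'name' (list): The names of each group in input 'groups' if the name does not contain 'other_org'.
- 'key_terms_in' (list): A boolean list respective to 'name' indicating if at least one key word from 'key_terms' is present in the group name.
- 'blacklist_in' (list): A boolean list respective to 'name' indicating if at least one term from 'blacklist' is present in the group name.
- 'num_groups' (int): The length of result['name'].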
def check_libs(libs, key_terms: str | list='',
               blacklist: str | list | bool=False, other_org: str=''):
    """
    Checks libraries for desired `key_terms` in groups.

    Parameters
    ----------
    libs : dict
        A dictionary as `libs[library_name] = library_groups`.

    key_terms : str | list
        A `str` or `list` of `str`s to search for in `libs` group
        names.

    blacklist : str | list | bool
        Term(s) undesired in group names.

    other_org : str
        Only applicable when desired organism is `'human'` or
        `'mouse'`. If desired organism is `'human'`, `other_org`
        should be `'mouse'` and vice-versa.

    Returns
    -------
    summary, tally : pd.DataFrame | pd.DataFrame
        `summary` has cols `['Library', 'No. Gene Sets']` plus
        `'No. Key Terms Matching'` if `key_terms` is provided and
        `'No. Blacklist Matching'` if `blacklist` is provided. Rows
        are sorted by library name and only libraries with at least
        one retained group are listed. `tally` has cols `['library',
        'key_terms_in', 'blacklist_in', 'name']` with one row per
        retained group.
    """
    tally = {
        'library' : list(),
        'key_terms_in' : list(),
        'blacklist_in' : list(),
        'name' : list()
    }

    # counts[library] = (n groups, n key term hits, n blacklist hits)
    counts = dict()

    for library, groups in libs.items():
        res = check_groups(list(groups), key_terms, blacklist, other_org)
        n_kept = res['num_groups']

        tally['library'].extend([library] * n_kept)
        tally['key_terms_in'].extend(res['key_terms_in'])
        tally['blacklist_in'].extend(res['blacklist_in'])
        tally['name'].extend(res['name'])

        if n_kept:
            counts[library] = (n_kept,
                               sum(res['key_terms_in']),
                               sum(res['blacklist_in']))

    tally = pd.DataFrame(tally)

    lib_names = sorted(counts)

    summary = {
        'Library' : lib_names,
        'No. Gene Sets' : [counts[name][0] for name in lib_names]
    }

    if key_terms:
        summary['No. Key Terms Matching'] = [counts[name][1]
                                             for name in lib_names]
    if blacklist:
        summary['No. Blacklist Matching'] = [counts[name][2]
                                             for name in lib_names]

    summary = pd.DataFrame(summary)

    return summary, tally
Checks libraries for desired key_terms in groups.
Parameters
- libs (dict): A dictionary as libs[library_name] = library_groups.
- key_terms (str | list): A str or list of strs to search for in libs group names.
- other_org (str): Only applicable when desired organism is 'human' or 'mouse'. If desired organism is 'human', other_org should be 'mouse' and vice-versa.
Returns
- summary, tally (pd.DataFrame | pd.DataFrame):
summary has cols ['Library', 'No. Gene Sets', 'No. Key Terms Matching'] where 'Library' is the library from gseapy with 'No. Gene Sets' and 'No. Key Terms Matching' corresponding. 'No. Key Terms Matching' is only included if the key_terms argument is provided. tally has cols ['library', 'key_terms_in', 'blacklist_in', 'name'].
def find_candidates(organism: str='human', key_terms: str | list='',
                    blacklist: str | list | bool=False):
    """
    Given `organism` and `key_terms`, will search for gene
    groupings that could fit the datasets/classification task.
    `blacklist` specifies terms undesired in group names.

    Parameters
    ----------
    organism : str
        The species the gene grouping is for. Options are
        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}`
        (case-insensitive).

    key_terms : str | list
        The types of cells or other specifiers the gene set is for
        (example: 'CD4 T').

    blacklist : str | list | bool
        Term(s) undesired in group names. Ignored unless provided.

    Returns
    -------
    libraries : pd.DataFrame
        The summary table from `check_libs`, one row per library.

    Examples
    --------
    >>> scmkl.find_candidates('human', key_terms=' b ')
                            Library  No. Gene Sets
    0                    Azimuth_2023           1241
    1         Azimuth_Cell_Types_2021            341
    2   Cancer_Cell_Line_Encyclopedia            967
    3                 CellMarker_2024           1134
    No. Key Terms Matching
                         9
                         9
                         0
                        21
    """
    check_organism(organism)

    # Compare on the lower-cased name so 'Human' and 'human' behave
    # the same way; removing the raw string from the organism list
    # raised ValueError for capitalised input.
    org = organism.lower()

    if org in global_lib_orgs:
        other_org = next(o for o in global_lib_orgs if o != org)
        lib_names = [name for name in human_genesets
                     if other_org not in name.lower()]
    else:
        lib_names = gp.get_library_name(organism)
        other_org = ''

    libs = {name : gp.get_library(name, organism)
            for name in lib_names}

    libs_df, _ = check_libs(libs, key_terms, blacklist, other_org)

    return libs_df
Given organism and key_terms, will search for gene
groupings that could fit the datasets/classification task.
blacklist specifies terms undesired in group names.
Parameters
- organism (str): The species the gene grouping is for. Options are {'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}.
- key_terms (str | list): The types of cells or other specifiers the gene set is for (example: 'CD4 T').
- blacklist (str | list | bool): Term(s) undesired in group names. Ignored unless provided.
Returns
- libraries (pd.DataFrame): A summary table of the gene set libraries that could serve for the dataset/classification task, with one row per library.
Examples
>>> scmkl.find_candidates('human', key_terms=' b ')
Library No. Gene Sets
0 Azimuth_2023 1241
1 Azimuth_Cell_Types_2021 341
2 Cancer_Cell_Line_Encyclopedia 967
3 CellMarker_2024 1134
No. Key Terms Matching
9
9
0
21
def get_gene_groupings(lib_name: str, organism: str='human', key_terms: str | list='',
                       blacklist: str | list | bool=False, min_overlap: int=2,
                       genes: list | tuple | pd.Series | np.ndarray | set=()):
    """
    Takes a gene set library name and filters to groups containing
    element(s) in `key_terms` and none of the terms in `blacklist`.
    If genes is provided, will ensure that there are at least
    `min_overlap` number of genes in each group.

    Parameters
    ----------
    lib_name : str
        The desired library name.

    organism : str
        The species the gene grouping is for. Options are
        `{'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}`
        (case-insensitive).

    key_terms : str | list
        The types of cells or other specifiers the gene set is for
        (example: 'CD4 T').

    blacklist : str | list | bool
        Term(s) undesired in group names. Ignored unless provided.

    min_overlap : int
        The minimum number of genes that must be present in a group
        for it to be kept. If `genes` is not given, ignored.

    genes : array_like
        A vector of genes from the reference/query datasets. If not
        assigned, function will not filter groups based on feature
        overlap.

    Returns
    -------
    lib : dict
        The filtered library as a `dict` where keys are group names
        and values are features.

    Examples
    --------
    >>> dataset_feats = [
    ...     'FUCA1', 'CLIC4', 'STMN1', 'SYF2', 'TAS1R1',
    ...     'NOL9', 'TAS1R3', 'SLC2A5', 'THAP3', 'IGHM',
    ...     'MARCKS', 'BANK1', 'TNFRSF13B', 'IGKC', 'IGHD',
    ...     'LINC01857', 'CD24', 'CD37', 'IGHD', 'RALGPS2'
    ... ]
    >>> rna_grouping = scmkl.get_gene_groupings(
    ...     'Azimuth_2023', key_terms=[' b ', 'b cell', 'b '],
    ...     genes=dataset_feats)
    >>>
    >>> rna_grouping.keys()
    dict_keys(['PBMC-L1-B Cell', 'PBMC-L2-Intermediate B Cell', ...])
    """
    check_organism(organism)

    lib = gp.get_library(lib_name, organism)

    # Compare on the lower-cased name so 'Human' and 'human' behave
    # the same way; removing the raw string from the organism list
    # raised ValueError for capitalised input.
    org = organism.lower()
    if org in global_lib_orgs:
        other_org = next(o for o in global_lib_orgs if o != org)
    else:
        other_org = ''

    res = check_groups(list(lib), key_terms, blacklist, other_org)

    use_key_terms = bool(key_terms)
    use_blacklist = bool(blacklist)

    if not use_key_terms:
        print("Not filtering with `key_terms` parameter.")
    if not use_blacklist:
        print("Not filtering with `blacklist` parameter.")

    # Keep groups that match a key term (when given) and do not match
    # a blacklist term (when given); library order is preserved.
    kept_groups = [
        name
        for name, has_key, has_bl in zip(res['name'],
                                         res['key_terms_in'],
                                         res['blacklist_in'])
        if (has_key or not use_key_terms)
        and not (use_blacklist and has_bl)
    ]

    # Filtering library
    lib = {group : lib[group] for group in kept_groups}

    if 0 < len(genes):
        # `set()` accepts every advertised input type, including
        # tuples, which have no `.copy()` method.
        gene_set = set(genes)

        del_groups = [
            group for group, features in lib.items()
            if sum(feat in gene_set for feat in features) < min_overlap
        ]

        # Removing groups without enough overlap
        for group in del_groups:
            print(f'Removing {group} from grouping.')
            del lib[group]

    else:
        print("Not checking overlap between group and dataset features.")

    return lib
Takes a gene set library name and filters to groups containing
element(s) in key_terms. If genes is provided, will
ensure that there are at least min_overlap number of genes in
each group. Resulting groups will meet all of the before-mentioned
criteria.
Parameters
- lib_name (str): The desired library name.
- organism (str): The species the gene grouping is for. Options are {'Human', 'Mouse', 'Yeast', 'Fly', 'Fish', 'Worm'}.
- key_terms (str | list): The types of cells or other specifiers the gene set is for (example: 'CD4 T').
- genes (array_like): A vector of genes from the reference/query datasets. If not assigned, function will not filter groups based on feature overlap.
- min_overlap (int): The minimum number of genes that must be present in a group for it to be kept. If genes is not given, ignored.
Returns
- lib (dict): The filtered library as a dict where keys are group names and values are features.
Examples
>>> dataset_feats = [
... 'FUCA1', 'CLIC4', 'STMN1', 'SYF2', 'TAS1R1',
... 'NOL9', 'TAS1R3', 'SLC2A5', 'THAP3', 'IGHM',
... 'MARCKS', 'BANK1', 'TNFRSF13B', 'IGKC', 'IGHD',
... 'LINC01857', 'CD24', 'CD37', 'IGHD', 'RALGPS2'
... ]
>>> rna_grouping = scmkl.get_gene_groupings(
... 'Azimuth_2023', key_terms=[' b ', 'b cell', 'b '],
... genes=dataset_feats)
>>>
>>> rna_grouping.keys()
dict_keys(['PBMC-L1-B Cell', 'PBMC-L2-Intermediate B Cell', ...])