scmkl.get_region_groupings
1import pandas as pd 2import numpy as np 3import re 4 5 6def _find_overlap(start1 : int, end1 : int, start2 : int, end2 : int) -> bool: 7 """ 8 Function to determine whether two regions on the same chromosome 9 overlap. 10 11 Parameters 12 ---------- 13 start1 : int 14 The start position for region 1 15 16 end1 : int 17 The end position for region 1 18 19 start2 : int 20 The start position for region 2 21 22 end2: int 23 The end postion for region 2 24 25 Returns 26 ------- 27 is_overlapping : bool 28 `True` if the regions overlap by 1bp. `False` if the regions 29 do not overlap 30 """ 31 return max(start1, start2) <= min(end1, end2) 32 33 34def get_tss(row : pd.DataFrame) -> int: 35 """ 36 Takes a row from a DataFrame as a DataFrame with columns 37 ['start', 'end', 'strand'] and returns the transcription start site 38 depending on gene strandedness. 39 40 Parameters 41 ---------- 42 row : pd.DataFrame 43 A gtf dataframe containing only one row and columns 44 `['start', 'end', 'strand']`. 45 46 Returns 47 ------- 48 tss : int 49 The transcription start site for row's respective annotation. 50 """ 51 if row.iloc[2] == '+': 52 return row.iloc[0] 53 54 elif row.iloc[2] == '-': 55 return row.iloc[1] 56 57 else: 58 raise ValueError("Strand symbol must be '+' or '-'") 59 60 61def calc_range(row : pd.DataFrame, len_up : int, len_down : int) -> list: 62 """ 63 Returns an infered promotor region for given annotation range 64 depending on transcription start site and user-defined upstream 65 and downstream adjustments. 66 67 Parameters 68 ---------- 69 row : pd.DataFrame 70 A gtf as a dataframe containing only one row and columns 71 `['tss', 'strand']` where `'tss'` is the transcription start 72 site. 73 74 len_up : int 75 Number of base pairs upstream from the transcription 76 start site the promotor range should be adjusted for. 77 78 len_down : int 79 Number of base pairs downstream from the transcription start 80 site the promotor range should be adjusted for. 81 82 Returns 83 ------- 84 start_end : list 85 A 2 element list where the first element is the adjusted start 86 position and the second the the adjusted end position. 87 """ 88 if row.iloc[1] == '+': 89 start = row.iloc[0] - len_up 90 end = row.iloc[0] + len_down 91 92 elif row.iloc[1] == '-': 93 start = row.iloc[0] - len_down 94 end = row.iloc[0] + len_up 95 96 else: 97 raise ValueError('Strand symbol must be "+" or "-"') 98 99 return [start, end] 100 101 102def adjust_regions(gene_anno : pd.DataFrame, len_up : int, len_down : int 103 ) -> pd.DataFrame: 104 """ 105 Takes a GTF file as a pd.DataFrame and adjusts start and end 106 positions to represent promotor regions given user-defined 107 upstream and downstream adjustments relative to the transcription 108 start site. 109 110 Parameters 111 ---------- 112 gene_anno : pd.DataFrame 113 A dataframe with columns `['chr', 'start', 'end', 'strand', 114 'gene_name']` created from a GTF file. 115 116 len_up : int 117 Number of base pairs upstream of the transcription start site 118 the promotor region should be adjusted for. 119 120 len_down : int 121 Number base pairs downstream of the transcription start site 122 the promotor region should be adjusted for. 123 124 Returns 125 ------- 126 gene_anno : pd.DataFrame 127 A dataframe where `['start', 'end']` columns represent the 128 start and end positions of inferred promotor regions for each 129 annotation. 130 """ 131 # Subsetting DataFrame to only required data 132 required_cols = ['chr', 'start', 'end', 'strand', 'gene_name'] 133 gene_anno = gene_anno[required_cols] 134 135 # Adding a column that annotates transcription start site based on strand 136 tss = gene_anno[['start', 'end', 'strand']].apply(get_tss, axis = 1) 137 gene_anno.insert(1, column = 'tss', value = tss) 138 139 # Calculating start and end positions based on the tss 140 adj_regions = gene_anno[['tss', 'strand']].apply(lambda row: 141 calc_range( 142 row = row, 143 len_up = len_up, 144 len_down = len_down), 145 axis = 1, 146 result_type = 'expand' 147 ) 148 149 gene_anno.loc[:, ['start', 'end']] = adj_regions.values 150 151 return gene_anno 152 153 154def create_region_dicts(gene_anno : pd.DataFrame) -> dict: 155 """ 156 Takes a GTF as a pd.DataFrame and returns dictionaries required for 157 region comparisons between gene_annotations and assay features. 158 159 Parameters 160 ---------- 161 gene_anno : pd.DataFrame 162 A dataframe with columns `['chr', 'start', 'end', 'strand', 163 'gene_name']` created from a GTF file. 164 165 Returns 166 ------- 167 peak_gene_dict : dict 168 Keys are regions and values are genes. 169 170 ga_regions : dict 171 Chromosomes are keys and regions are values. 172 """ 173 peak_gene_dict = {} 174 ga_regions = {} 175 176 for _, anno in gene_anno.iterrows(): 177 178 cur_chr = anno['chr'] 179 cur_start = int(anno['start']) 180 cur_end = int(anno['end']) 181 cur_name = anno['gene_name'] 182 183 peak_gene_dict[(cur_chr, cur_start, cur_end)] = cur_name 184 185 cur_region = np.array([[cur_start, cur_end]], dtype = int) 186 if cur_chr in ga_regions.keys(): 187 ga_regions[cur_chr] = np.concatenate((ga_regions[cur_chr], 188 cur_region), axis = 0) 189 190 else: 191 ga_regions[cur_chr] = cur_region 192 193 return peak_gene_dict, ga_regions 194 195 196def create_feature_dict(feature_names : list | set | np.ndarray | pd.Series 197 ) -> dict: 198 """ 199 Takes an array of feature names and returns data formatted for 200 feature comparisons. 201 202 Returns 203 ------- 204 feature_names : array_like 205 Array of region names from single-cell epigenetic data matrix. 206 207 Returns 208 ------- 209 feature_dict : dict 210 Keys are chromosomes and values are regions. 211 """ 212 feature_dict = {} 213 feature_names = [re.split(":|-", peak) for peak in feature_names] 214 for peak_set in feature_names: 215 if peak_set[0] in feature_dict.keys(): 216 feature_dict[peak_set[0]] = np.concatenate( 217 (feature_dict[peak_set[0]], 218 np.array([[peak_set[1], 219 peak_set[2]]], 220 dtype = int)), 221 axis = 0) 222 else: 223 feature_dict[peak_set[0]] = np.array([[peak_set[1], 224 peak_set[2]]], 225 dtype = int) 226 227 return feature_dict 228 229 230def compare_regions(feature_dict : dict, ga_regions : dict, 231 peak_gene_dict : dict, gene_sets : dict, chr_sep : str 232 ) -> dict: 233 """ 234 Takes features from a single-cell data matrix and regions from 235 a gene annotation file to return an region grouping where regions 236 from feature_dict and regions from gene annotations overlap. 237 238 Parameters 239 ---------- 240 feature_dict : dict 241 Keys are chromosomes and values are regions. This data should 242 come from a single-cell experiment. 243 244 ga_regions : dict 245 Keys are chromosomes and values are regions. This data should 246 come from a gene annotations (gtf) file. 247 248 peak_gene_dict : dict 249 Keys are peaks from gene annotation file and values are the 250 gene they are associated with. 251 252 gene_sets : dict 253 Keys are gene set names and values are an iterable object of 254 gene names. 255 256 chr_sep : str 257 The character that separates the chromosome from the rest of 258 the region in the original feature array. 259 260 Returns 261 ------- 262 epi_grouping : dict 263 Keys are the names from gene_sets and values are a list of 264 regions from `feature_names` that overlap with promotor regions 265 respective to genes in `gene_sets` (i.e., if region in 266 `feature_names` overlaps with promotor region from a gene in a 267 gene set from `gene_sets`, that region will be added to the new 268 dictionary under the respective gene set name). 269 """ 270 epi_grouping = {group : [] for group in gene_sets.keys()} 271 272 for chrom in feature_dict.keys(): 273 if chrom not in ga_regions.keys(): 274 continue 275 for region in feature_dict[chrom]: 276 for anno in ga_regions[chrom]: 277 if _find_overlap(region[0], region[1], anno[0], anno[1]): 278 gene = peak_gene_dict[(chrom, anno[0], anno[1])] 279 for group in gene_sets.keys(): 280 if gene in gene_sets[group]: 281 feat_region = ''.join((chrom, chr_sep, 282 str(region[0]), "-", 283 str(region[1]))) 284 epi_grouping[group].append(feat_region) 285 286 return epi_grouping 287 288 289def get_region_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 290 feature_names : np.ndarray | pd.Series | list | set, 291 len_up : int = 5000, len_down : int = 5000) -> dict: 292 """ 293 Creates a peak set where keys are gene set names from `gene_sets` 294 and values are arrays of features pulled from `feature_names`. 295 Features are added to each peak set given overlap between regions 296 in single-cell data matrix and inferred gene promoter regions in 297 `gene_anno`. 298 299 Parameters 300 ---------- 301 gene_anno : pd.DataFrame 302 Gene annotations in GTF format as a pd.DataFrame with columns 303 `['chr', 'start', 'end', 'strand', 'gene_name']`. 304 305 gene_sets : dict 306 Gene set names as keys and an iterable object of gene names 307 as values. 308 309 feature_names : array_like | set 310 Feature names corresponding to a single_cell epigenetic data 311 matrix. 312 313 Returns 314 ------- 315 epi_grouping : dict 316 Keys are the names from `gene_sets` and values 317 are a list of regions from `feature_names` that overlap with 318 promotor regions respective to genes in `gene_sets` (i.e., if 319 region in `feature_names` overlaps with promotor region from a 320 gene in a gene set from `gene_sets`, that region will be added 321 to the new dictionary under the respective gene set name). 322 323 Examples 324 -------- 325 >>> # Reading in a gene set and the peak names from dataset 326 >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 327 ... allow_pickle = True) 328 >>> peaks = np.load("data/MCF7_region_names.npy", 329 ... allow_pickle = True) 330 >>> 331 >>> # Reading in GTF file 332 >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf" 333 >>> gtf = scmkl.read_gtf(gtf_path, filter_to_coding=True) 334 >>> 335 >>> region_grouping = scmkl.get_region_groupings(gene_anno = gtf, 336 ... gene_sets = gene_sets, 337 ... feature_names = peaks) 338 >>> 339 >>> region_grouping.keys() 340 dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...]) 341 """ 342 # Getting a unique set of gene names from gene_sets 343 all_genes = {gene for group in gene_sets.keys() 344 for gene in gene_sets[group]} 345 346 # Filtering out NaN values 347 all_genes = [gene for gene in all_genes if type(gene) != float] 348 349 # Filtering out annotations for genes not present in gene_sets 350 gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)] 351 352 # Adjusting start and end columns to represent promotor regions 353 gene_anno = adjust_regions(gene_anno = gene_anno, 354 len_up = len_up, len_down = len_down) 355 356 # Creating a dictionary from assay features where [chr] : (start, end) 357 feature_dict = create_feature_dict(feature_names) 358 359 # Creating data structures from gene_anno for comparing regions 360 peak_gene_dict, ga_regions = create_region_dicts(gene_anno) 361 362 # Capturing the separator type used in assay 363 chr_sep = ':' if ':' in feature_names[0] else '-' 364 365 epi_groupings = compare_regions(feature_dict = feature_dict, 366 ga_regions = ga_regions, 367 peak_gene_dict = peak_gene_dict, 368 gene_sets = gene_sets, 369 chr_sep = chr_sep) 370 371 return epi_groupings
35def get_tss(row : pd.DataFrame) -> int: 36 """ 37 Takes a row from a DataFrame as a DataFrame with columns 38 ['start', 'end', 'strand'] and returns the transcription start site 39 depending on gene strandedness. 40 41 Parameters 42 ---------- 43 row : pd.DataFrame 44 A gtf dataframe containing only one row and columns 45 `['start', 'end', 'strand']`. 46 47 Returns 48 ------- 49 tss : int 50 The transcription start site for row's respective annotation. 51 """ 52 if row.iloc[2] == '+': 53 return row.iloc[0] 54 55 elif row.iloc[2] == '-': 56 return row.iloc[1] 57 58 else: 59 raise ValueError("Strand symbol must be '+' or '-'")
Takes a row from a DataFrame as a DataFrame with columns ['start', 'end', 'strand'] and returns the transcription start site depending on gene strandedness.
Parameters
- row (pd.DataFrame):
A gtf dataframe containing only one row and columns
['start', 'end', 'strand']
.
Returns
- tss (int): The transcription start site for row's respective annotation.
62def calc_range(row : pd.DataFrame, len_up : int, len_down : int) -> list: 63 """ 64 Returns an infered promotor region for given annotation range 65 depending on transcription start site and user-defined upstream 66 and downstream adjustments. 67 68 Parameters 69 ---------- 70 row : pd.DataFrame 71 A gtf as a dataframe containing only one row and columns 72 `['tss', 'strand']` where `'tss'` is the transcription start 73 site. 74 75 len_up : int 76 Number of base pairs upstream from the transcription 77 start site the promotor range should be adjusted for. 78 79 len_down : int 80 Number of base pairs downstream from the transcription start 81 site the promotor range should be adjusted for. 82 83 Returns 84 ------- 85 start_end : list 86 A 2 element list where the first element is the adjusted start 87 position and the second the the adjusted end position. 88 """ 89 if row.iloc[1] == '+': 90 start = row.iloc[0] - len_up 91 end = row.iloc[0] + len_down 92 93 elif row.iloc[1] == '-': 94 start = row.iloc[0] - len_down 95 end = row.iloc[0] + len_up 96 97 else: 98 raise ValueError('Strand symbol must be "+" or "-"') 99 100 return [start, end]
Returns an infered promotor region for given annotation range depending on transcription start site and user-defined upstream and downstream adjustments.
Parameters
- row (pd.DataFrame):
A gtf as a dataframe containing only one row and columns
['tss', 'strand']
where'tss'
is the transcription start site. - len_up (int): Number of base pairs upstream from the transcription start site the promotor range should be adjusted for.
- len_down (int): Number of base pairs downstream from the transcription start site the promotor range should be adjusted for.
Returns
- start_end (list): A 2 element list where the first element is the adjusted start position and the second the the adjusted end position.
103def adjust_regions(gene_anno : pd.DataFrame, len_up : int, len_down : int 104 ) -> pd.DataFrame: 105 """ 106 Takes a GTF file as a pd.DataFrame and adjusts start and end 107 positions to represent promotor regions given user-defined 108 upstream and downstream adjustments relative to the transcription 109 start site. 110 111 Parameters 112 ---------- 113 gene_anno : pd.DataFrame 114 A dataframe with columns `['chr', 'start', 'end', 'strand', 115 'gene_name']` created from a GTF file. 116 117 len_up : int 118 Number of base pairs upstream of the transcription start site 119 the promotor region should be adjusted for. 120 121 len_down : int 122 Number base pairs downstream of the transcription start site 123 the promotor region should be adjusted for. 124 125 Returns 126 ------- 127 gene_anno : pd.DataFrame 128 A dataframe where `['start', 'end']` columns represent the 129 start and end positions of inferred promotor regions for each 130 annotation. 131 """ 132 # Subsetting DataFrame to only required data 133 required_cols = ['chr', 'start', 'end', 'strand', 'gene_name'] 134 gene_anno = gene_anno[required_cols] 135 136 # Adding a column that annotates transcription start site based on strand 137 tss = gene_anno[['start', 'end', 'strand']].apply(get_tss, axis = 1) 138 gene_anno.insert(1, column = 'tss', value = tss) 139 140 # Calculating start and end positions based on the tss 141 adj_regions = gene_anno[['tss', 'strand']].apply(lambda row: 142 calc_range( 143 row = row, 144 len_up = len_up, 145 len_down = len_down), 146 axis = 1, 147 result_type = 'expand' 148 ) 149 150 gene_anno.loc[:, ['start', 'end']] = adj_regions.values 151 152 return gene_anno
Takes a GTF file as a pd.DataFrame and adjusts start and end positions to represent promotor regions given user-defined upstream and downstream adjustments relative to the transcription start site.
Parameters
- gene_anno (pd.DataFrame):
A dataframe with columns
['chr', 'start', 'end', 'strand', 'gene_name']
created from a GTF file. - len_up (int): Number of base pairs upstream of the transcription start site the promotor region should be adjusted for.
- len_down (int): Number base pairs downstream of the transcription start site the promotor region should be adjusted for.
Returns
- gene_anno (pd.DataFrame):
A dataframe where
['start', 'end']
columns represent the start and end positions of inferred promotor regions for each annotation.
155def create_region_dicts(gene_anno : pd.DataFrame) -> dict: 156 """ 157 Takes a GTF as a pd.DataFrame and returns dictionaries required for 158 region comparisons between gene_annotations and assay features. 159 160 Parameters 161 ---------- 162 gene_anno : pd.DataFrame 163 A dataframe with columns `['chr', 'start', 'end', 'strand', 164 'gene_name']` created from a GTF file. 165 166 Returns 167 ------- 168 peak_gene_dict : dict 169 Keys are regions and values are genes. 170 171 ga_regions : dict 172 Chromosomes are keys and regions are values. 173 """ 174 peak_gene_dict = {} 175 ga_regions = {} 176 177 for _, anno in gene_anno.iterrows(): 178 179 cur_chr = anno['chr'] 180 cur_start = int(anno['start']) 181 cur_end = int(anno['end']) 182 cur_name = anno['gene_name'] 183 184 peak_gene_dict[(cur_chr, cur_start, cur_end)] = cur_name 185 186 cur_region = np.array([[cur_start, cur_end]], dtype = int) 187 if cur_chr in ga_regions.keys(): 188 ga_regions[cur_chr] = np.concatenate((ga_regions[cur_chr], 189 cur_region), axis = 0) 190 191 else: 192 ga_regions[cur_chr] = cur_region 193 194 return peak_gene_dict, ga_regions
Takes a GTF as a pd.DataFrame and returns dictionaries required for region comparisons between gene_annotations and assay features.
Parameters
- gene_anno (pd.DataFrame):
A dataframe with columns
['chr', 'start', 'end', 'strand', 'gene_name']
created from a GTF file.
Returns
- peak_gene_dict (dict): Keys are regions and values are genes.
- ga_regions (dict): Chromosomes are keys and regions are values.
197def create_feature_dict(feature_names : list | set | np.ndarray | pd.Series 198 ) -> dict: 199 """ 200 Takes an array of feature names and returns data formatted for 201 feature comparisons. 202 203 Returns 204 ------- 205 feature_names : array_like 206 Array of region names from single-cell epigenetic data matrix. 207 208 Returns 209 ------- 210 feature_dict : dict 211 Keys are chromosomes and values are regions. 212 """ 213 feature_dict = {} 214 feature_names = [re.split(":|-", peak) for peak in feature_names] 215 for peak_set in feature_names: 216 if peak_set[0] in feature_dict.keys(): 217 feature_dict[peak_set[0]] = np.concatenate( 218 (feature_dict[peak_set[0]], 219 np.array([[peak_set[1], 220 peak_set[2]]], 221 dtype = int)), 222 axis = 0) 223 else: 224 feature_dict[peak_set[0]] = np.array([[peak_set[1], 225 peak_set[2]]], 226 dtype = int) 227 228 return feature_dict
Takes an array of feature names and returns data formatted for feature comparisons.
Returns
- feature_names (array_like): Array of region names from single-cell epigenetic data matrix.
Returns
- feature_dict (dict): Keys are chromosomes and values are regions.
231def compare_regions(feature_dict : dict, ga_regions : dict, 232 peak_gene_dict : dict, gene_sets : dict, chr_sep : str 233 ) -> dict: 234 """ 235 Takes features from a single-cell data matrix and regions from 236 a gene annotation file to return an region grouping where regions 237 from feature_dict and regions from gene annotations overlap. 238 239 Parameters 240 ---------- 241 feature_dict : dict 242 Keys are chromosomes and values are regions. This data should 243 come from a single-cell experiment. 244 245 ga_regions : dict 246 Keys are chromosomes and values are regions. This data should 247 come from a gene annotations (gtf) file. 248 249 peak_gene_dict : dict 250 Keys are peaks from gene annotation file and values are the 251 gene they are associated with. 252 253 gene_sets : dict 254 Keys are gene set names and values are an iterable object of 255 gene names. 256 257 chr_sep : str 258 The character that separates the chromosome from the rest of 259 the region in the original feature array. 260 261 Returns 262 ------- 263 epi_grouping : dict 264 Keys are the names from gene_sets and values are a list of 265 regions from `feature_names` that overlap with promotor regions 266 respective to genes in `gene_sets` (i.e., if region in 267 `feature_names` overlaps with promotor region from a gene in a 268 gene set from `gene_sets`, that region will be added to the new 269 dictionary under the respective gene set name). 270 """ 271 epi_grouping = {group : [] for group in gene_sets.keys()} 272 273 for chrom in feature_dict.keys(): 274 if chrom not in ga_regions.keys(): 275 continue 276 for region in feature_dict[chrom]: 277 for anno in ga_regions[chrom]: 278 if _find_overlap(region[0], region[1], anno[0], anno[1]): 279 gene = peak_gene_dict[(chrom, anno[0], anno[1])] 280 for group in gene_sets.keys(): 281 if gene in gene_sets[group]: 282 feat_region = ''.join((chrom, chr_sep, 283 str(region[0]), "-", 284 str(region[1]))) 285 epi_grouping[group].append(feat_region) 286 287 return epi_grouping
Takes features from a single-cell data matrix and regions from a gene annotation file to return an region grouping where regions from feature_dict and regions from gene annotations overlap.
Parameters
- feature_dict (dict): Keys are chromosomes and values are regions. This data should come from a single-cell experiment.
- ga_regions (dict): Keys are chromosomes and values are regions. This data should come from a gene annotations (gtf) file.
- peak_gene_dict (dict): Keys are peaks from gene annotation file and values are the gene they are associated with.
- gene_sets (dict): Keys are gene set names and values are an iterable object of gene names.
- chr_sep (str): The character that separates the chromosome from the rest of the region in the original feature array.
Returns
- epi_grouping (dict):
Keys are the names from gene_sets and values are a list of
regions from
feature_names
that overlap with promotor regions respective to genes ingene_sets
(i.e., if region infeature_names
overlaps with promotor region from a gene in a gene set fromgene_sets
, that region will be added to the new dictionary under the respective gene set name).
290def get_region_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 291 feature_names : np.ndarray | pd.Series | list | set, 292 len_up : int = 5000, len_down : int = 5000) -> dict: 293 """ 294 Creates a peak set where keys are gene set names from `gene_sets` 295 and values are arrays of features pulled from `feature_names`. 296 Features are added to each peak set given overlap between regions 297 in single-cell data matrix and inferred gene promoter regions in 298 `gene_anno`. 299 300 Parameters 301 ---------- 302 gene_anno : pd.DataFrame 303 Gene annotations in GTF format as a pd.DataFrame with columns 304 `['chr', 'start', 'end', 'strand', 'gene_name']`. 305 306 gene_sets : dict 307 Gene set names as keys and an iterable object of gene names 308 as values. 309 310 feature_names : array_like | set 311 Feature names corresponding to a single_cell epigenetic data 312 matrix. 313 314 Returns 315 ------- 316 epi_grouping : dict 317 Keys are the names from `gene_sets` and values 318 are a list of regions from `feature_names` that overlap with 319 promotor regions respective to genes in `gene_sets` (i.e., if 320 region in `feature_names` overlaps with promotor region from a 321 gene in a gene set from `gene_sets`, that region will be added 322 to the new dictionary under the respective gene set name). 323 324 Examples 325 -------- 326 >>> # Reading in a gene set and the peak names from dataset 327 >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 328 ... allow_pickle = True) 329 >>> peaks = np.load("data/MCF7_region_names.npy", 330 ... allow_pickle = True) 331 >>> 332 >>> # Reading in GTF file 333 >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf" 334 >>> gtf = scmkl.read_gtf(gtf_path, filter_to_coding=True) 335 >>> 336 >>> region_grouping = scmkl.get_region_groupings(gene_anno = gtf, 337 ... gene_sets = gene_sets, 338 ... feature_names = peaks) 339 >>> 340 >>> region_grouping.keys() 341 dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...]) 342 """ 343 # Getting a unique set of gene names from gene_sets 344 all_genes = {gene for group in gene_sets.keys() 345 for gene in gene_sets[group]} 346 347 # Filtering out NaN values 348 all_genes = [gene for gene in all_genes if type(gene) != float] 349 350 # Filtering out annotations for genes not present in gene_sets 351 gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)] 352 353 # Adjusting start and end columns to represent promotor regions 354 gene_anno = adjust_regions(gene_anno = gene_anno, 355 len_up = len_up, len_down = len_down) 356 357 # Creating a dictionary from assay features where [chr] : (start, end) 358 feature_dict = create_feature_dict(feature_names) 359 360 # Creating data structures from gene_anno for comparing regions 361 peak_gene_dict, ga_regions = create_region_dicts(gene_anno) 362 363 # Capturing the separator type used in assay 364 chr_sep = ':' if ':' in feature_names[0] else '-' 365 366 epi_groupings = compare_regions(feature_dict = feature_dict, 367 ga_regions = ga_regions, 368 peak_gene_dict = peak_gene_dict, 369 gene_sets = gene_sets, 370 chr_sep = chr_sep) 371 372 return epi_groupings
Creates a peak set where keys are gene set names from gene_sets
and values are arrays of features pulled from feature_names
.
Features are added to each peak set given overlap between regions
in single-cell data matrix and inferred gene promoter regions in
gene_anno
.
Parameters
- gene_anno (pd.DataFrame):
Gene annotations in GTF format as a pd.DataFrame with columns
['chr', 'start', 'end', 'strand', 'gene_name']
. - gene_sets (dict): Gene set names as keys and an iterable object of gene names as values.
- feature_names (array_like | set): Feature names corresponding to a single_cell epigenetic data matrix.
Returns
- epi_grouping (dict):
Keys are the names from
gene_sets
and values are a list of regions fromfeature_names
that overlap with promotor regions respective to genes ingene_sets
(i.e., if region infeature_names
overlaps with promotor region from a gene in a gene set fromgene_sets
, that region will be added to the new dictionary under the respective gene set name).
Examples
>>> # Reading in a gene set and the peak names from dataset
>>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl",
... allow_pickle = True)
>>> peaks = np.load("data/MCF7_region_names.npy",
... allow_pickle = True)
>>>
>>> # Reading in GTF file
>>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
>>> gtf = scmkl.read_gtf(gtf_path, filter_to_coding=True)
>>>
>>> region_grouping = scmkl.get_region_groupings(gene_anno = gtf,
... gene_sets = gene_sets,
... feature_names = peaks)
>>>
>>> region_grouping.keys()
dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])