scmkl.get_region_groupings

View Source

  1import pandas as pd
  2import numpy as np
  3import re
  4
  5
  6def _find_overlap(start1 : int, end1 : int, start2 : int, end2 : int) -> bool:
  7    """
  8    Function to determine whether two regions on the same chromosome 
  9    overlap.
 10    
 11    Parameters
 12    ----------
 13    start1 : int
 14        The start position for region 1
 15    
 16    end1 : int
 17        The end position for region 1
 18    
 19    start2 : int
 20        The start position for region 2
 21    
 22    end2: int
 23        The end postion for region 2
 24    
 25    Returns
 26    -------
 27    is_overlapping : bool
 28        `True` if the regions overlap by 1bp. `False` if the regions 
 29        do not overlap
 30    """
 31    return max(start1, start2) <= min(end1, end2)
 32
 33
 34def get_tss(row : pd.DataFrame) -> int:
 35    """
 36    Takes a row from a DataFrame as a DataFrame with columns 
 37    ['start', 'end', 'strand'] and returns the transcription start site
 38    depending on gene strandedness.
 39    
 40    Parameters
 41    ----------
 42    row : pd.DataFrame 
 43        A gtf dataframe containing only one row and columns 
 44        `['start', 'end', 'strand']`.
 45    
 46    Returns
 47    -------
 48    tss : int
 49        The transcription start site for row's respective annotation.
 50    """
 51    if row.iloc[2] == '+':
 52        return row.iloc[0]
 53    
 54    elif row.iloc[2] == '-':
 55        return row.iloc[1]
 56    
 57    else:
 58        raise ValueError("Strand symbol must be '+' or '-'")
 59    
 60
 61def calc_range(row : pd.DataFrame, len_up : int, len_down : int) -> list:
 62    """
 63    Returns an infered promotor region for given annotation range 
 64    depending on transcription start site and user-defined upstream 
 65    and downstream adjustments.
 66
 67    Parameters
 68    ----------
 69    row : pd.DataFrame 
 70        A gtf as a dataframe containing only one row and columns 
 71        `['tss', 'strand']` where `'tss'` is the transcription start 
 72        site.
 73    
 74    len_up : int 
 75        Number of base pairs upstream from the transcription 
 76        start site the promotor range should be adjusted for.
 77
 78    len_down : int
 79        Number of base pairs downstream from the transcription start 
 80        site the promotor range should be adjusted for.
 81    
 82    Returns
 83    -------
 84    start_end : list
 85        A 2 element list where the first element is the adjusted start
 86        position and the second the the adjusted end position.
 87    """
 88    if row.iloc[1] == '+':
 89        start = row.iloc[0] - len_up
 90        end = row.iloc[0] + len_down
 91
 92    elif row.iloc[1] == '-':
 93        start = row.iloc[0] - len_down
 94        end = row.iloc[0] + len_up
 95
 96    else:
 97        raise ValueError('Strand symbol must be "+" or "-"')
 98    
 99    return [start, end]
100
101
102def adjust_regions(gene_anno : pd.DataFrame, len_up : int, len_down : int
103                    ) -> pd.DataFrame:
104    """
105    Takes a GTF file as a pd.DataFrame and adjusts start and end 
106    positions to represent promotor regions given user-defined 
107    upstream and downstream adjustments relative to the transcription
108    start site.
109
110    Parameters
111    ----------
112    gene_anno : pd.DataFrame 
113        A dataframe with columns `['chr', 'start', 'end', 'strand', 
114        'gene_name']` created from a GTF file.
115    
116    len_up : int
117        Number of base pairs upstream of the transcription start site 
118        the promotor region should be adjusted for.
119    
120    len_down : int
121        Number base pairs downstream of the transcription start site 
122        the promotor region should be adjusted for.
123    
124    Returns
125    -------
126    gene_anno : pd.DataFrame
127        A dataframe where `['start', 'end']` columns represent the 
128        start and end positions of inferred promotor regions for each 
129        annotation.
130    """
131    # Subsetting DataFrame to only required data
132    required_cols = ['chr', 'start', 'end', 'strand', 'gene_name']
133    gene_anno = gene_anno[required_cols]
134
135    # Adding a column that annotates transcription start site based on strand
136    tss = gene_anno[['start', 'end', 'strand']].apply(get_tss, axis = 1)
137    gene_anno.insert(1, column = 'tss', value = tss)
138
139    # Calculating start and end positions based on the tss
140    adj_regions = gene_anno[['tss', 'strand']].apply(lambda row:
141                                                     calc_range(
142                                                        row = row,
143                                                        len_up = len_up,
144                                                        len_down = len_down), 
145                                                   axis = 1, 
146                                                   result_type = 'expand'
147                                                   )
148    
149    gene_anno.loc[:, ['start', 'end']] = adj_regions.values
150    
151    return gene_anno
152
153
154def create_region_dicts(gene_anno : pd.DataFrame) -> dict:
155    """
156    Takes a GTF as a pd.DataFrame and returns dictionaries required for
157    region comparisons between gene_annotations and assay features.
158
159    Parameters
160    ----------
161    gene_anno : pd.DataFrame 
162        A dataframe with columns `['chr', 'start', 'end', 'strand', 
163        'gene_name']` created from a GTF file.
164    
165    Returns
166    -------
167    peak_gene_dict : dict
168        Keys are regions and values are genes.
169        
170    ga_regions : dict
171        Chromosomes are keys and regions are values.
172    """
173    peak_gene_dict = {}
174    ga_regions = {}
175
176    for _, anno in gene_anno.iterrows():
177
178        cur_chr = anno['chr']
179        cur_start = int(anno['start'])
180        cur_end = int(anno['end'])
181        cur_name = anno['gene_name']
182
183        peak_gene_dict[(cur_chr, cur_start, cur_end)] = cur_name
184
185        cur_region = np.array([[cur_start, cur_end]], dtype = int)
186        if cur_chr in ga_regions.keys():
187            ga_regions[cur_chr] = np.concatenate((ga_regions[cur_chr], 
188                                                  cur_region), axis = 0)
189        
190        else:
191            ga_regions[cur_chr] = cur_region
192
193    return peak_gene_dict, ga_regions
194
195
196def create_feature_dict(feature_names : list | set | np.ndarray | pd.Series
197                         ) -> dict:
198    """
199    Takes an array of feature names and returns data formatted for
200    feature comparisons.
201    
202    Returns
203    -------
204    feature_names : array_like
205        Array of region names from single-cell epigenetic data matrix.
206    
207    Returns
208    -------
209    feature_dict : dict
210        Keys are chromosomes and values are regions.
211    """
212    feature_dict = {}
213    feature_names = [re.split(":|-", peak) for peak in feature_names]
214    for peak_set in feature_names:
215        if peak_set[0] in feature_dict.keys():
216            feature_dict[peak_set[0]] = np.concatenate(
217                                                (feature_dict[peak_set[0]],
218                                                    np.array([[peak_set[1], 
219                                                                peak_set[2]]], 
220                                                                dtype = int)), 
221                                                                axis = 0)
222        else:
223            feature_dict[peak_set[0]] = np.array([[peak_set[1], 
224                                                   peak_set[2]]], 
225                                                   dtype = int)
226
227    return feature_dict
228
229
230def compare_regions(feature_dict : dict, ga_regions : dict,
231                     peak_gene_dict : dict, gene_sets : dict, chr_sep : str
232                     ) -> dict:
233    """
234    Takes features from a single-cell data matrix and regions from
235    a gene annotation file to return an region grouping where regions 
236    from feature_dict and regions from gene annotations overlap.
237
238    Parameters
239    ----------
240    feature_dict : dict
241        Keys are chromosomes and values are regions. This data should 
242        come from a single-cell experiment.
243
244    ga_regions : dict
245        Keys are chromosomes and values are regions. This data should 
246        come from a gene annotations (gtf) file.
247
248    peak_gene_dict : dict
249        Keys are peaks from gene annotation file and values are the 
250        gene they are associated with.
251
252    gene_sets : dict
253        Keys are gene set names and values are an iterable object of 
254        gene names.
255
256    chr_sep : str
257        The character that separates the chromosome from the rest of 
258        the region in the original feature array.
259    
260    Returns
261    -------
262    epi_grouping : dict
263        Keys are the names from gene_sets and values are a list of 
264        regions from `feature_names` that overlap with promotor regions 
265        respective to genes in `gene_sets` (i.e., if region in 
266        `feature_names` overlaps with promotor region from a gene in a 
267        gene set from `gene_sets`, that region will be added to the new 
268        dictionary under the respective gene set name).
269    """
270    epi_grouping = {group : [] for group in gene_sets.keys()}
271
272    for chrom in feature_dict.keys():
273        if chrom not in ga_regions.keys():
274            continue
275        for region in feature_dict[chrom]:
276            for anno in ga_regions[chrom]:
277                if _find_overlap(region[0], region[1], anno[0], anno[1]):
278                    gene = peak_gene_dict[(chrom, anno[0], anno[1])]
279                    for group in gene_sets.keys():
280                        if gene in gene_sets[group]:
281                            feat_region = ''.join((chrom, chr_sep, 
282                                                  str(region[0]), "-", 
283                                                  str(region[1])))
284                            epi_grouping[group].append(feat_region)
285
286    return epi_grouping
287
288
289def get_region_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 
290                       feature_names : np.ndarray | pd.Series | list | set,
291                       len_up : int = 5000, len_down : int = 5000) -> dict:
292    """
293    Creates a peak set where keys are gene set names from `gene_sets` 
294    and values are arrays of features pulled from `feature_names`. 
295    Features are added to each peak set given overlap between regions 
296    in single-cell data matrix and inferred gene promoter regions in 
297    `gene_anno`.
298
299    Parameters
300    ----------
301    gene_anno : pd.DataFrame
302        Gene annotations in GTF format as a pd.DataFrame with columns
303        `['chr', 'start', 'end', 'strand', 'gene_name']`.
304
305    gene_sets : dict
306        Gene set names as keys and an iterable object of gene names
307        as values.
308
309    feature_names : array_like | set
310        Feature names corresponding to a single_cell epigenetic data 
311        matrix.
312
313    Returns
314    -------
315    epi_grouping : dict
316        Keys are the names from `gene_sets` and values
317        are a list of regions from `feature_names` that overlap with 
318        promotor regions respective to genes in `gene_sets` (i.e., if 
319        region in `feature_names` overlaps with promotor region from a 
320        gene in a gene set from `gene_sets`, that region will be added 
321        to the new dictionary under the respective gene set name).
322
323    Examples
324    --------
325    >>> # Reading in a gene set and the peak names from dataset
326    >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 
327    ...                     allow_pickle = True)
328    >>> peaks = np.load("data/MCF7_region_names.npy", 
329    ...                 allow_pickle = True)
330    >>> 
331    >>> # Reading in GTF file
332    >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
333    >>> gtf = scmkl.read_gtf(gtf_path, filter_to_coding=True)
334    >>>
335    >>> region_grouping = scmkl.get_region_groupings(gene_anno = gtf,
336    ...                                         gene_sets = gene_sets,
337    ...                                         feature_names = peaks)
338    >>>
339    >>> region_grouping.keys()
340    dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])
341    """
342    # Getting a unique set of gene names from gene_sets
343    all_genes = {gene for group in gene_sets.keys()
344                 for gene in gene_sets[group]}
345    
346    # Filtering out NaN values
347    all_genes = [gene for gene in all_genes if type(gene) != float]
348
349    # Filtering out annotations for genes not present in gene_sets
350    gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)]
351
352    # Adjusting start and end columns to represent promotor regions
353    gene_anno = adjust_regions(gene_anno = gene_anno, 
354                                len_up = len_up, len_down = len_down)
355
356    # Creating a dictionary from assay features where [chr] : (start, end)
357    feature_dict = create_feature_dict(feature_names)
358
359    # Creating data structures from gene_anno for comparing regions
360    peak_gene_dict, ga_regions = create_region_dicts(gene_anno)
361
362    # Capturing the separator type used in assay
363    chr_sep = ':' if ':' in feature_names[0] else '-'
364
365    epi_groupings = compare_regions(feature_dict = feature_dict,
366                                     ga_regions = ga_regions,
367                                     peak_gene_dict = peak_gene_dict,
368                                     gene_sets = gene_sets,
369                                     chr_sep = chr_sep)
370    
371    return epi_groupings

def get_tss(row: pandas.core.frame.DataFrame) -> int: View Source

35def get_tss(row : pd.DataFrame) -> int:
36    """
37    Takes a row from a DataFrame as a DataFrame with columns 
38    ['start', 'end', 'strand'] and returns the transcription start site
39    depending on gene strandedness.
40    
41    Parameters
42    ----------
43    row : pd.DataFrame 
44        A gtf dataframe containing only one row and columns 
45        `['start', 'end', 'strand']`.
46    
47    Returns
48    -------
49    tss : int
50        The transcription start site for row's respective annotation.
51    """
52    if row.iloc[2] == '+':
53        return row.iloc[0]
54    
55    elif row.iloc[2] == '-':
56        return row.iloc[1]
57    
58    else:
59        raise ValueError("Strand symbol must be '+' or '-'")

Takes a row from a DataFrame as a DataFrame with columns ['start', 'end', 'strand'] and returns the transcription start site depending on gene strandedness.

Parameters

row (pd.DataFrame): A gtf dataframe containing only one row and columns ['start', 'end', 'strand'].

Returns

tss (int): The transcription start site for row's respective annotation.

def calc_range(row: pandas.core.frame.DataFrame, len_up: int, len_down: int) -> list: View Source

 62def calc_range(row : pd.DataFrame, len_up : int, len_down : int) -> list:
 63    """
 64    Returns an infered promotor region for given annotation range 
 65    depending on transcription start site and user-defined upstream 
 66    and downstream adjustments.
 67
 68    Parameters
 69    ----------
 70    row : pd.DataFrame 
 71        A gtf as a dataframe containing only one row and columns 
 72        `['tss', 'strand']` where `'tss'` is the transcription start 
 73        site.
 74    
 75    len_up : int 
 76        Number of base pairs upstream from the transcription 
 77        start site the promotor range should be adjusted for.
 78
 79    len_down : int
 80        Number of base pairs downstream from the transcription start 
 81        site the promotor range should be adjusted for.
 82    
 83    Returns
 84    -------
 85    start_end : list
 86        A 2 element list where the first element is the adjusted start
 87        position and the second the the adjusted end position.
 88    """
 89    if row.iloc[1] == '+':
 90        start = row.iloc[0] - len_up
 91        end = row.iloc[0] + len_down
 92
 93    elif row.iloc[1] == '-':
 94        start = row.iloc[0] - len_down
 95        end = row.iloc[0] + len_up
 96
 97    else:
 98        raise ValueError('Strand symbol must be "+" or "-"')
 99    
100    return [start, end]

Returns an infered promotor region for given annotation range depending on transcription start site and user-defined upstream and downstream adjustments.

Parameters

row (pd.DataFrame): A gtf as a dataframe containing only one row and columns ['tss', 'strand'] where 'tss' is the transcription start site.
len_up (int): Number of base pairs upstream from the transcription start site the promotor range should be adjusted for.
len_down (int): Number of base pairs downstream from the transcription start site the promotor range should be adjusted for.

Returns

start_end (list): A 2 element list where the first element is the adjusted start position and the second the the adjusted end position.

def adjust_regions( gene_anno: pandas.core.frame.DataFrame, len_up: int, len_down: int) -> pandas.core.frame.DataFrame: View Source

103def adjust_regions(gene_anno : pd.DataFrame, len_up : int, len_down : int
104                    ) -> pd.DataFrame:
105    """
106    Takes a GTF file as a pd.DataFrame and adjusts start and end 
107    positions to represent promotor regions given user-defined 
108    upstream and downstream adjustments relative to the transcription
109    start site.
110
111    Parameters
112    ----------
113    gene_anno : pd.DataFrame 
114        A dataframe with columns `['chr', 'start', 'end', 'strand', 
115        'gene_name']` created from a GTF file.
116    
117    len_up : int
118        Number of base pairs upstream of the transcription start site 
119        the promotor region should be adjusted for.
120    
121    len_down : int
122        Number base pairs downstream of the transcription start site 
123        the promotor region should be adjusted for.
124    
125    Returns
126    -------
127    gene_anno : pd.DataFrame
128        A dataframe where `['start', 'end']` columns represent the 
129        start and end positions of inferred promotor regions for each 
130        annotation.
131    """
132    # Subsetting DataFrame to only required data
133    required_cols = ['chr', 'start', 'end', 'strand', 'gene_name']
134    gene_anno = gene_anno[required_cols]
135
136    # Adding a column that annotates transcription start site based on strand
137    tss = gene_anno[['start', 'end', 'strand']].apply(get_tss, axis = 1)
138    gene_anno.insert(1, column = 'tss', value = tss)
139
140    # Calculating start and end positions based on the tss
141    adj_regions = gene_anno[['tss', 'strand']].apply(lambda row:
142                                                     calc_range(
143                                                        row = row,
144                                                        len_up = len_up,
145                                                        len_down = len_down), 
146                                                   axis = 1, 
147                                                   result_type = 'expand'
148                                                   )
149    
150    gene_anno.loc[:, ['start', 'end']] = adj_regions.values
151    
152    return gene_anno

Takes a GTF file as a pd.DataFrame and adjusts start and end positions to represent promotor regions given user-defined upstream and downstream adjustments relative to the transcription start site.

Parameters

gene_anno (pd.DataFrame): A dataframe with columns ['chr', 'start', 'end', 'strand', 'gene_name'] created from a GTF file.
len_up (int): Number of base pairs upstream of the transcription start site the promotor region should be adjusted for.
len_down (int): Number base pairs downstream of the transcription start site the promotor region should be adjusted for.

Returns

gene_anno (pd.DataFrame): A dataframe where ['start', 'end'] columns represent the start and end positions of inferred promotor regions for each annotation.

def create_region_dicts(gene_anno: pandas.core.frame.DataFrame) -> dict: View Source

155def create_region_dicts(gene_anno : pd.DataFrame) -> dict:
156    """
157    Takes a GTF as a pd.DataFrame and returns dictionaries required for
158    region comparisons between gene_annotations and assay features.
159
160    Parameters
161    ----------
162    gene_anno : pd.DataFrame 
163        A dataframe with columns `['chr', 'start', 'end', 'strand', 
164        'gene_name']` created from a GTF file.
165    
166    Returns
167    -------
168    peak_gene_dict : dict
169        Keys are regions and values are genes.
170        
171    ga_regions : dict
172        Chromosomes are keys and regions are values.
173    """
174    peak_gene_dict = {}
175    ga_regions = {}
176
177    for _, anno in gene_anno.iterrows():
178
179        cur_chr = anno['chr']
180        cur_start = int(anno['start'])
181        cur_end = int(anno['end'])
182        cur_name = anno['gene_name']
183
184        peak_gene_dict[(cur_chr, cur_start, cur_end)] = cur_name
185
186        cur_region = np.array([[cur_start, cur_end]], dtype = int)
187        if cur_chr in ga_regions.keys():
188            ga_regions[cur_chr] = np.concatenate((ga_regions[cur_chr], 
189                                                  cur_region), axis = 0)
190        
191        else:
192            ga_regions[cur_chr] = cur_region
193
194    return peak_gene_dict, ga_regions

Takes a GTF as a pd.DataFrame and returns dictionaries required for region comparisons between gene_annotations and assay features.

Parameters

gene_anno (pd.DataFrame): A dataframe with columns ['chr', 'start', 'end', 'strand', 'gene_name'] created from a GTF file.

Returns

peak_gene_dict (dict): Keys are regions and values are genes.
ga_regions (dict): Chromosomes are keys and regions are values.

def create_feature_dict( feature_names: list | set | numpy.ndarray | pandas.core.series.Series) -> dict: View Source

197def create_feature_dict(feature_names : list | set | np.ndarray | pd.Series
198                         ) -> dict:
199    """
200    Takes an array of feature names and returns data formatted for
201    feature comparisons.
202    
203    Returns
204    -------
205    feature_names : array_like
206        Array of region names from single-cell epigenetic data matrix.
207    
208    Returns
209    -------
210    feature_dict : dict
211        Keys are chromosomes and values are regions.
212    """
213    feature_dict = {}
214    feature_names = [re.split(":|-", peak) for peak in feature_names]
215    for peak_set in feature_names:
216        if peak_set[0] in feature_dict.keys():
217            feature_dict[peak_set[0]] = np.concatenate(
218                                                (feature_dict[peak_set[0]],
219                                                    np.array([[peak_set[1], 
220                                                                peak_set[2]]], 
221                                                                dtype = int)), 
222                                                                axis = 0)
223        else:
224            feature_dict[peak_set[0]] = np.array([[peak_set[1], 
225                                                   peak_set[2]]], 
226                                                   dtype = int)
227
228    return feature_dict

Takes an array of feature names and returns data formatted for feature comparisons.

Returns

feature_names (array_like): Array of region names from single-cell epigenetic data matrix.

Returns

feature_dict (dict): Keys are chromosomes and values are regions.

def compare_regions( feature_dict: dict, ga_regions: dict, peak_gene_dict: dict, gene_sets: dict, chr_sep: str) -> dict: View Source

231def compare_regions(feature_dict : dict, ga_regions : dict,
232                     peak_gene_dict : dict, gene_sets : dict, chr_sep : str
233                     ) -> dict:
234    """
235    Takes features from a single-cell data matrix and regions from
236    a gene annotation file to return an region grouping where regions 
237    from feature_dict and regions from gene annotations overlap.
238
239    Parameters
240    ----------
241    feature_dict : dict
242        Keys are chromosomes and values are regions. This data should 
243        come from a single-cell experiment.
244
245    ga_regions : dict
246        Keys are chromosomes and values are regions. This data should 
247        come from a gene annotations (gtf) file.
248
249    peak_gene_dict : dict
250        Keys are peaks from gene annotation file and values are the 
251        gene they are associated with.
252
253    gene_sets : dict
254        Keys are gene set names and values are an iterable object of 
255        gene names.
256
257    chr_sep : str
258        The character that separates the chromosome from the rest of 
259        the region in the original feature array.
260    
261    Returns
262    -------
263    epi_grouping : dict
264        Keys are the names from gene_sets and values are a list of 
265        regions from `feature_names` that overlap with promotor regions 
266        respective to genes in `gene_sets` (i.e., if region in 
267        `feature_names` overlaps with promotor region from a gene in a 
268        gene set from `gene_sets`, that region will be added to the new 
269        dictionary under the respective gene set name).
270    """
271    epi_grouping = {group : [] for group in gene_sets.keys()}
272
273    for chrom in feature_dict.keys():
274        if chrom not in ga_regions.keys():
275            continue
276        for region in feature_dict[chrom]:
277            for anno in ga_regions[chrom]:
278                if _find_overlap(region[0], region[1], anno[0], anno[1]):
279                    gene = peak_gene_dict[(chrom, anno[0], anno[1])]
280                    for group in gene_sets.keys():
281                        if gene in gene_sets[group]:
282                            feat_region = ''.join((chrom, chr_sep, 
283                                                  str(region[0]), "-", 
284                                                  str(region[1])))
285                            epi_grouping[group].append(feat_region)
286
287    return epi_grouping

Takes features from a single-cell data matrix and regions from a gene annotation file to return an region grouping where regions from feature_dict and regions from gene annotations overlap.

Parameters

feature_dict (dict): Keys are chromosomes and values are regions. This data should come from a single-cell experiment.
ga_regions (dict): Keys are chromosomes and values are regions. This data should come from a gene annotations (gtf) file.
peak_gene_dict (dict): Keys are peaks from gene annotation file and values are the gene they are associated with.
gene_sets (dict): Keys are gene set names and values are an iterable object of gene names.
chr_sep (str): The character that separates the chromosome from the rest of the region in the original feature array.

Returns

epi_grouping (dict): Keys are the names from gene_sets and values are a list of regions from feature_names that overlap with promotor regions respective to genes in gene_sets (i.e., if region in feature_names overlaps with promotor region from a gene in a gene set from gene_sets, that region will be added to the new dictionary under the respective gene set name).

def get_region_groupings( gene_anno: pandas.core.frame.DataFrame, gene_sets: dict, feature_names: numpy.ndarray | pandas.core.series.Series | list | set, len_up: int = 5000, len_down: int = 5000) -> dict: View Source

290def get_region_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 
291                       feature_names : np.ndarray | pd.Series | list | set,
292                       len_up : int = 5000, len_down : int = 5000) -> dict:
293    """
294    Creates a peak set where keys are gene set names from `gene_sets` 
295    and values are arrays of features pulled from `feature_names`. 
296    Features are added to each peak set given overlap between regions 
297    in single-cell data matrix and inferred gene promoter regions in 
298    `gene_anno`.
299
300    Parameters
301    ----------
302    gene_anno : pd.DataFrame
303        Gene annotations in GTF format as a pd.DataFrame with columns
304        `['chr', 'start', 'end', 'strand', 'gene_name']`.
305
306    gene_sets : dict
307        Gene set names as keys and an iterable object of gene names
308        as values.
309
310    feature_names : array_like | set
311        Feature names corresponding to a single_cell epigenetic data 
312        matrix.
313
314    Returns
315    -------
316    epi_grouping : dict
317        Keys are the names from `gene_sets` and values
318        are a list of regions from `feature_names` that overlap with 
319        promotor regions respective to genes in `gene_sets` (i.e., if 
320        region in `feature_names` overlaps with promotor region from a 
321        gene in a gene set from `gene_sets`, that region will be added 
322        to the new dictionary under the respective gene set name).
323
324    Examples
325    --------
326    >>> # Reading in a gene set and the peak names from dataset
327    >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 
328    ...                     allow_pickle = True)
329    >>> peaks = np.load("data/MCF7_region_names.npy", 
330    ...                 allow_pickle = True)
331    >>> 
332    >>> # Reading in GTF file
333    >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
334    >>> gtf = scmkl.read_gtf(gtf_path, filter_to_coding=True)
335    >>>
336    >>> region_grouping = scmkl.get_region_groupings(gene_anno = gtf,
337    ...                                         gene_sets = gene_sets,
338    ...                                         feature_names = peaks)
339    >>>
340    >>> region_grouping.keys()
341    dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])
342    """
343    # Getting a unique set of gene names from gene_sets
344    all_genes = {gene for group in gene_sets.keys()
345                 for gene in gene_sets[group]}
346    
347    # Filtering out NaN values
348    all_genes = [gene for gene in all_genes if type(gene) != float]
349
350    # Filtering out annotations for genes not present in gene_sets
351    gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)]
352
353    # Adjusting start and end columns to represent promotor regions
354    gene_anno = adjust_regions(gene_anno = gene_anno, 
355                                len_up = len_up, len_down = len_down)
356
357    # Creating a dictionary from assay features where [chr] : (start, end)
358    feature_dict = create_feature_dict(feature_names)
359
360    # Creating data structures from gene_anno for comparing regions
361    peak_gene_dict, ga_regions = create_region_dicts(gene_anno)
362
363    # Capturing the separator type used in assay
364    chr_sep = ':' if ':' in feature_names[0] else '-'
365
366    epi_groupings = compare_regions(feature_dict = feature_dict,
367                                     ga_regions = ga_regions,
368                                     peak_gene_dict = peak_gene_dict,
369                                     gene_sets = gene_sets,
370                                     chr_sep = chr_sep)
371    
372    return epi_groupings

Creates a peak set where keys are gene set names from gene_sets and values are arrays of features pulled from feature_names. Features are added to each peak set given overlap between regions in single-cell data matrix and inferred gene promoter regions in gene_anno.

Parameters

gene_anno (pd.DataFrame): Gene annotations in GTF format as a pd.DataFrame with columns ['chr', 'start', 'end', 'strand', 'gene_name'].
gene_sets (dict): Gene set names as keys and an iterable object of gene names as values.
feature_names (array_like | set): Feature names corresponding to a single_cell epigenetic data matrix.

Returns

epi_grouping (dict): Keys are the names from gene_sets and values are a list of regions from feature_names that overlap with promotor regions respective to genes in gene_sets (i.e., if region in feature_names overlaps with promotor region from a gene in a gene set from gene_sets, that region will be added to the new dictionary under the respective gene set name).

Examples

>>> # Reading in a gene set and the peak names from dataset
>>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 
...                     allow_pickle = True)
>>> peaks = np.load("data/MCF7_region_names.npy", 
...                 allow_pickle = True)
>>> 
>>> # Reading in GTF file
>>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
>>> gtf = scmkl.read_gtf(gtf_path, filter_to_coding=True)
>>>
>>> region_grouping = scmkl.get_region_groupings(gene_anno = gtf,
...                                         gene_sets = gene_sets,
...                                         feature_names = peaks)
>>>
>>> region_grouping.keys()
dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])