scmkl.get_atac_groupings

  1import pandas as pd
  2import numpy as np
  3import re
  4
  5
  6def _find_overlap(start1 : int, end1 : int, start2 : int, end2 : int) -> bool:
  7    '''
  8    Function to determine whether two regions on the same chromosome 
  9    overlap.
 10    
 11    Parameters
 12    ----------
 13    start1 : the start position for region 1
 14    end1 : the end position for region 1
 15    start2 : the start position for region 2
 16    end2: the end postion for region 2
 17    
 18    Output
 19    ------
 20    True if the regions overlap by 1bp
 21    False if the regions do not overlap
 22    '''
 23    return max(start1,start2) <= min(end1, end2)
 24
 25
 26def _get_tss(row : pd.DataFrame) -> int:
 27    '''
 28    Takes a row from a DataFrame as a DataFrame with columns 
 29    ['start', 'end', 'strand'] and returns the transcription start site
 30    depending on gene strandedness.
 31    
 32    Parameters
 33    ----------
 34    row : a pd.DataFrame from a gtf as a pd.DataFrame containing
 35          only one row and columns ['start', 'end', 'strand'].
 36    
 37    Returns
 38    -------
 39    The transcription start site for row's respective annotation.
 40    '''
 41    if row.iloc[2] == '+':
 42        return row.iloc[0]
 43    
 44    elif row.iloc[2] == '-':
 45        return row.iloc[1]
 46    
 47    else:
 48        raise ValueError('Strand symbol must be "+" or "-"')
 49    
 50
 51def _calc_range(row : pd.DataFrame, len_up : int, len_down : int) -> list:
 52    '''
 53    Returns an infered promotor region for given annotation range 
 54    depending on transcription start site and user-defined upstream and
 55    downstream adjustments.
 56
 57    Parameters
 58    ----------
 59    row : a pd.DataFrame from a gtf as a pd.DataFrame containing
 60          only one row and columns ['tss', 'strand'] where 'tss' is
 61          the transcription start site.
 62    len_up : how many base pairs upstream from the transcription 
 63             start site the promotor range should be adjusted for.
 64    len_down : how many base pairs downstream from the 
 65               transcription start site the promotor range should be 
 66               adjusted for.
 67    
 68    Returns
 69    -------
 70    A 2 element list where the first element is the adjusted start
 71    position and the second the the adjusted end position.
 72    '''
 73    if row.iloc[1] == '+':
 74        start = row.iloc[0] - len_up
 75        end = row.iloc[0] + len_down
 76
 77    elif row.iloc[1] == '-':
 78        start = row.iloc[0] - len_down
 79        end = row.iloc[0] + len_up
 80
 81    else:
 82        raise ValueError('Strand symbol must be "+" or "-"')
 83    
 84    return [start, end]
 85
 86
 87def _adjust_regions(gene_anno : pd.DataFrame, len_up : int, len_down : int
 88                    ) -> pd.DataFrame:
 89    '''
 90    Takes a GTF file as a pd.DataFrame and adjusts start and end 
 91    positions to represent promotor regions given user-defined 
 92    upstream and downstream adjustments relative to the transcription
 93    start site.
 94
 95    Parameters
 96    ----------
 97    gene_anno : a pd.DataFrame with columns ['chr', 'start', 'end', 
 98                'strand', 'gene_name'] created from a GTF file.
 99    len_up : an int for how many base pairs upstream of the 
100             transcription start site the promotor region should be 
101             adjusted for.
102    len_down : an int for how many base pairs downstream of the 
103               transcription start site the promotor region should be 
104               adjusted for.
105    
106    Returns
107    -------
108    gene_anno as a pd.DataFrame where ['start', 'end'] columns 
109    represent the start and end positions of inferred promotor
110    regions for each annotation.
111    '''
112    # Subsetting DataFrame to only required data
113    required_cols = ['chr', 'start', 'end', 'strand', 'gene_name']
114    gene_anno = gene_anno[required_cols]
115
116    # Adding a column that annotates transcription start site based on strand
117    tss = gene_anno[['start', 'end', 'strand']].apply(_get_tss, axis = 1)
118    gene_anno.insert(1, column = 'tss', value = tss)
119
120    # Calculating start and end positions based on the tss
121    adj_regions = gene_anno[['tss', 'strand']].apply(lambda row:
122                                                    _calc_range(
123                                                        row = row,
124                                                        len_up = len_up,
125                                                        len_down = len_down), 
126                                                   axis = 1, 
127                                                   result_type = 'expand'
128                                                   )
129    
130    gene_anno.loc[:, ['start', 'end']] = adj_regions.values
131    
132    return gene_anno
133
134
135def _create_region_dicts(gene_anno : pd.DataFrame) -> dict:
136    '''
137    Takes a GTF as a pd.DataFrame and returns dictionaries required for
138    region comparisons between gene_annotations and assay features.
139
140    Parameters
141    ----------
142    gene_anno : a pd.DataFrame with columns ['chr', 'start', 'end', 
143                'strand', 'gene_name'] created from a GTF file.
144    
145    Returns
146    -------
147    peak_gene_dict : dictionary where keys are regions and
148                     values are genes.
149    ga_regions : dictionary where chromosomes are keys and regions are 
150                 values.
151    '''
152    peak_gene_dict = {}
153    ga_regions = {}
154
155    for i, anno in gene_anno.iterrows():
156
157        cur_chr = anno['chr']
158        cur_start = int(anno['start'])
159        cur_end = int(anno['end'])
160        cur_name = anno['gene_name']
161
162        peak_gene_dict[(cur_chr, cur_start, cur_end)] = cur_name
163
164        cur_region = np.array([[cur_start, cur_end]], dtype = int)
165        if cur_chr in ga_regions.keys():
166            ga_regions[cur_chr] = np.concatenate((ga_regions[cur_chr], 
167                                                  cur_region), axis = 0)
168        
169        else:
170            ga_regions[cur_chr] = cur_region
171
172    return peak_gene_dict, ga_regions
173
174
175def _create_feature_dict(feature_names : list | set | np.ndarray | pd.Series
176                         ) -> dict:
177    '''
178    Takes an array of feature names and returns data formatted for
179    feature comparisons.
180    
181    Returns
182    -------
183    feature_names : an iterable object of region names from single-cell 
184                    ATAC data matrix.
185    
186    Returns
187    -------
188        A dictionary where keys are chromosomes and values are regions.
189    '''
190    feature_dict = {}
191    feature_names = [re.split(":|-", peak) for peak in feature_names]
192    for peak_set in feature_names:
193        if peak_set[0] in feature_dict.keys():
194            feature_dict[peak_set[0]] = np.concatenate(
195                                                (feature_dict[peak_set[0]],
196                                                    np.array([[peak_set[1], 
197                                                                peak_set[2]]], 
198                                                                dtype = int)), 
199                                                                axis = 0)
200        else:
201            feature_dict[peak_set[0]] = np.array([[peak_set[1], 
202                                                   peak_set[2]]], 
203                                                   dtype = int)
204
205    return feature_dict
206
207
208def _compare_regions(feature_dict : dict, ga_regions : dict,
209                     peak_gene_dict : dict, gene_sets : dict, chr_sep : str
210                     ) -> dict:
211    '''
212    Takes features from a single-cell ATAC data matrix and regions from
213    a gene annotation file to return an ATAC grouping where regions 
214    from feature_dict and regions from gene annotations overlap.
215
216    Parameters
217    ----------
218    feature_dict : a dictionary where keys are chromosomes and 
219                   values are regions. This data should come from a 
220                   single-cell atac experiment.
221    ga_regions : a dictionary where keys are chromosomes and values
222                 are regions. This data should come from a gene 
223                 annotations file.
224    peak_gene_dict : a dictionary where keys are peaks from gene 
225                     annotation file and values are the gene they are 
226                     associated with.
227    gene_sets : a dictionary where keys are gene set names and values 
228                are an iterable object of gene names.
229    chr_sep : The character that separates the chromosome from the 
230                rest of the region in the original feature array.
231    
232    Returns
233    -------
234    A dictionary where keys are the names from gene_sets and values
235    are a list of regions from feature_names that overlap with 
236    promotor regions respective to genes in gene sets (i.e., if 
237    ATAC feature in feature_names overlaps with promotor region 
238    from a gene in a gene set from gene_sets, that region will be
239    added to the new dictionary under the respective gene set 
240    name).
241    '''
242    atac_grouping = {group : [] for group in gene_sets.keys()}
243
244    for chrom in feature_dict.keys():
245        if chrom not in ga_regions.keys():
246            continue
247        for region in feature_dict[chrom]:
248            for anno in ga_regions[chrom]:
249                if _find_overlap(region[0], region[1], anno[0], anno[1]):
250                    gene = peak_gene_dict[(chrom, anno[0], anno[1])]
251                    for group in gene_sets.keys():
252                        if gene in gene_sets[group]:
253                            feat_region = ''.join(chrom, chr_sep, 
254                                                  str(region[0]), "-", 
255                                                  str(region[1]))
256                            atac_grouping[group].append(feat_region)
257
258    return atac_grouping
259
260
261def get_atac_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 
262                       feature_names : np.ndarray | pd.Series | list | set,
263                       len_up : int = 5000, len_down : int = 5000) -> dict:
264    '''
265    Creates a peak set where keys are gene set names from `gene_sets` 
266    and values are arrays of features pulled from `feature_names`. 
267    Features are added to each peak set given overlap between regions 
268    in single-cell data matrix and inferred gene promoter regions in 
269    `gene_anno`.
270
271    Parameters
272    ----------
273    **gene_anno** : *pd.DataFrame*
274        > Gene annotations in GTF format as a pd.DataFrame with columns
275        ['chr', 'start', 'end', 'strand', 'gene_name'].
276
277    **gene_sets** : *dict*
278        > Gene set names as keys and an iterable object of gene names
279        as values.
280
281    **feature_names** : *np.ndarray* | *pd.Series* | *list* | *set*
282        > Feature names corresponding to a single_cell ATAC data 
283        matrix.
284
285    Returns
286    -------
287    **atac_grouping** : *dict*
288        > Keys are the names from `gene_sets` and values
289        are a list of regions from `feature_names` that overlap with 
290        promotor regions respective to genes in `gene_sets` (i.e., if 
291        ATAC feature in `feature_names` overlaps with promotor region 
292        from a gene in a gene set from `gene_sets`, that region will be
293        added to the new dictionary under the respective gene set 
294        name).
295
296    Examples
297    --------
298    >>> # Reading in a gene set and the peak names from scATAC dataset
299    >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 
300    ...                     allow_pickle = True)
301    >>> peaks = np.load("data/MCF7_ATAC_feature_names.npy", 
302    ...                 allow_pickle = True)
303    >>> 
304    >>> # Reading in GTF file
305    >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
306    >>> gtf = pd.read_csv(gtf_path, sep = "\t", header = None, 
307    ...                   skip_blank_lines = True, comment = "#")
308    >>> 
309    >>> # Naming columns
310    >>> gtf.columns = ['chr', 'source', 'feature', 'start', 'end', 
311    ...                'score', 'strand', 'frame', 'attribute']
312    >>>
313    >>> # Subsetting to only protein coding genes
314    >>> prot_rows = gtf['attribute'].str.contains('protein_coding')
315    >>> gtf = gtf[prot_rows]
316    >>> gtf = gtf[gtf['feature'] == 'gene']
317    >>> 
318    >>> # Capturing gene name from attributes column
319    >>> gtf['gene_name'] = [re.findall(r'(?<=gene_name ")[A-z0-9]+', 
320    ...                                attr)[0] 
321    ...                     for attr in gtf['attribute']]
322    >>>
323    >>> atac_grouping = scmkl.get_atac_groupings(gene_anno = gtf,
324    ...                                         gene_sets = gene_sets,
325    ...                                         feature_names = peaks)
326    >>>
327    >>> atac_grouping.keys()
328    dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])
329    '''
330    # Getting a unique set of gene names from gene_sets
331    all_genes = {gene for group in gene_sets.keys()
332                 for gene in gene_sets[group]}
333    
334    # Filtering out NaN values
335    all_genes = [gene for gene in all_genes if type(gene) != float]
336
337    # Filtering out annotations for genes not present in gene_sets
338    gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)]
339
340    # Adjusting start and end columns to represent promotor regions
341    gene_anno = _adjust_regions(gene_anno = gene_anno, 
342                                len_up = len_up, len_down = len_down)
343
344    # Creating a dictionary from assay features where [chr] : (start, end)
345    feature_dict = _create_feature_dict(feature_names)
346
347    # Creating data structures from gene_anno for comparing regions
348    peak_gene_dict, ga_regions = _create_region_dicts(gene_anno)
349
350    # Capturing the separator type used in assay
351    chr_sep = ':' if ':' in feature_names[0] else '-'
352
353    atac_groupings = _compare_regions(feature_dict = feature_dict,
354                                     ga_regions = ga_regions,
355                                     peak_gene_dict = peak_gene_dict,
356                                     gene_sets = gene_sets,
357                                     chr_sep = chr_sep)
358    
359    return atac_groupings
def get_atac_groupings( gene_anno: pandas.core.frame.DataFrame, gene_sets: dict, feature_names: numpy.ndarray | pandas.core.series.Series | list | set, len_up: int = 5000, len_down: int = 5000) -> dict:
262def get_atac_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 
263                       feature_names : np.ndarray | pd.Series | list | set,
264                       len_up : int = 5000, len_down : int = 5000) -> dict:
265    '''
266    Creates a peak set where keys are gene set names from `gene_sets` 
267    and values are arrays of features pulled from `feature_names`. 
268    Features are added to each peak set given overlap between regions 
269    in single-cell data matrix and inferred gene promoter regions in 
270    `gene_anno`.
271
272    Parameters
273    ----------
274    **gene_anno** : *pd.DataFrame*
275        > Gene annotations in GTF format as a pd.DataFrame with columns
276        ['chr', 'start', 'end', 'strand', 'gene_name'].
277
278    **gene_sets** : *dict*
279        > Gene set names as keys and an iterable object of gene names
280        as values.
281
282    **feature_names** : *np.ndarray* | *pd.Series* | *list* | *set*
283        > Feature names corresponding to a single_cell ATAC data 
284        matrix.
285
286    Returns
287    -------
288    **atac_grouping** : *dict*
289        > Keys are the names from `gene_sets` and values
290        are a list of regions from `feature_names` that overlap with 
291        promotor regions respective to genes in `gene_sets` (i.e., if 
292        ATAC feature in `feature_names` overlaps with promotor region 
293        from a gene in a gene set from `gene_sets`, that region will be
294        added to the new dictionary under the respective gene set 
295        name).
296
297    Examples
298    --------
299    >>> # Reading in a gene set and the peak names from scATAC dataset
300    >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 
301    ...                     allow_pickle = True)
302    >>> peaks = np.load("data/MCF7_ATAC_feature_names.npy", 
303    ...                 allow_pickle = True)
304    >>> 
305    >>> # Reading in GTF file
306    >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
307    >>> gtf = pd.read_csv(gtf_path, sep = "\t", header = None, 
308    ...                   skip_blank_lines = True, comment = "#")
309    >>> 
310    >>> # Naming columns
311    >>> gtf.columns = ['chr', 'source', 'feature', 'start', 'end', 
312    ...                'score', 'strand', 'frame', 'attribute']
313    >>>
314    >>> # Subsetting to only protein coding genes
315    >>> prot_rows = gtf['attribute'].str.contains('protein_coding')
316    >>> gtf = gtf[prot_rows]
317    >>> gtf = gtf[gtf['feature'] == 'gene']
318    >>> 
319    >>> # Capturing gene name from attributes column
320    >>> gtf['gene_name'] = [re.findall(r'(?<=gene_name ")[A-z0-9]+', 
321    ...                                attr)[0] 
322    ...                     for attr in gtf['attribute']]
323    >>>
324    >>> atac_grouping = scmkl.get_atac_groupings(gene_anno = gtf,
325    ...                                         gene_sets = gene_sets,
326    ...                                         feature_names = peaks)
327    >>>
328    >>> atac_grouping.keys()
329    dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])
330    '''
331    # Getting a unique set of gene names from gene_sets
332    all_genes = {gene for group in gene_sets.keys()
333                 for gene in gene_sets[group]}
334    
335    # Filtering out NaN values
336    all_genes = [gene for gene in all_genes if type(gene) != float]
337
338    # Filtering out annotations for genes not present in gene_sets
339    gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)]
340
341    # Adjusting start and end columns to represent promotor regions
342    gene_anno = _adjust_regions(gene_anno = gene_anno, 
343                                len_up = len_up, len_down = len_down)
344
345    # Creating a dictionary from assay features where [chr] : (start, end)
346    feature_dict = _create_feature_dict(feature_names)
347
348    # Creating data structures from gene_anno for comparing regions
349    peak_gene_dict, ga_regions = _create_region_dicts(gene_anno)
350
351    # Capturing the separator type used in assay
352    chr_sep = ':' if ':' in feature_names[0] else '-'
353
354    atac_groupings = _compare_regions(feature_dict = feature_dict,
355                                     ga_regions = ga_regions,
356                                     peak_gene_dict = peak_gene_dict,
357                                     gene_sets = gene_sets,
358                                     chr_sep = chr_sep)
359    
360    return atac_groupings

Creates a peak set where keys are gene set names from gene_sets and values are arrays of features pulled from feature_names. Features are added to each peak set given overlap between regions in single-cell data matrix and inferred gene promoter regions in gene_anno.

Parameters

gene_anno : pd.DataFrame

Gene annotations in GTF format as a pd.DataFrame with columns ['chr', 'start', 'end', 'strand', 'gene_name'].

gene_sets : dict

Gene set names as keys and an iterable object of gene names as values.

feature_names : np.ndarray | pd.Series | list | set

Feature names corresponding to a single_cell ATAC data matrix.

Returns

atac_grouping : dict

Keys are the names from gene_sets and values are a list of regions from feature_names that overlap with promotor regions respective to genes in gene_sets (i.e., if ATAC feature in feature_names overlaps with promotor region from a gene in a gene set from gene_sets, that region will be added to the new dictionary under the respective gene set name).

Examples

>>> # Reading in a gene set and the peak names from scATAC dataset
>>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 
...                     allow_pickle = True)
>>> peaks = np.load("data/MCF7_ATAC_feature_names.npy", 
...                 allow_pickle = True)
>>> 
>>> # Reading in GTF file
>>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
>>> gtf = pd.read_csv(gtf_path, sep = "     ", header = None, 
...                   skip_blank_lines = True, comment = "#")
>>> 
>>> # Naming columns
>>> gtf.columns = ['chr', 'source', 'feature', 'start', 'end', 
...                'score', 'strand', 'frame', 'attribute']
>>>
>>> # Subsetting to only protein coding genes
>>> prot_rows = gtf['attribute'].str.contains('protein_coding')
>>> gtf = gtf[prot_rows]
>>> gtf = gtf[gtf['feature'] == 'gene']
>>> 
>>> # Capturing gene name from attributes column
>>> gtf['gene_name'] = [re.findall(r'(?<=gene_name ")[A-z0-9]+', 
...                                attr)[0] 
...                     for attr in gtf['attribute']]
>>>
>>> atac_grouping = scmkl.get_atac_groupings(gene_anno = gtf,
...                                         gene_sets = gene_sets,
...                                         feature_names = peaks)
>>>
>>> atac_grouping.keys()
dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])