scmkl.get_atac_groupings
1import pandas as pd 2import numpy as np 3import re 4 5 6def _find_overlap(start1 : int, end1 : int, start2 : int, end2 : int) -> bool: 7 ''' 8 Function to determine whether two regions on the same chromosome 9 overlap. 10 11 Parameters 12 ---------- 13 start1 : the start position for region 1 14 end1 : the end position for region 1 15 start2 : the start position for region 2 16 end2: the end postion for region 2 17 18 Output 19 ------ 20 True if the regions overlap by 1bp 21 False if the regions do not overlap 22 ''' 23 return max(start1,start2) <= min(end1, end2) 24 25 26def _get_tss(row : pd.DataFrame) -> int: 27 ''' 28 Takes a row from a DataFrame as a DataFrame with columns 29 ['start', 'end', 'strand'] and returns the transcription start site 30 depending on gene strandedness. 31 32 Parameters 33 ---------- 34 row : a pd.DataFrame from a gtf as a pd.DataFrame containing 35 only one row and columns ['start', 'end', 'strand']. 36 37 Returns 38 ------- 39 The transcription start site for row's respective annotation. 40 ''' 41 if row.iloc[2] == '+': 42 return row.iloc[0] 43 44 elif row.iloc[2] == '-': 45 return row.iloc[1] 46 47 else: 48 raise ValueError('Strand symbol must be "+" or "-"') 49 50 51def _calc_range(row : pd.DataFrame, len_up : int, len_down : int) -> list: 52 ''' 53 Returns an infered promotor region for given annotation range 54 depending on transcription start site and user-defined upstream and 55 downstream adjustments. 56 57 Parameters 58 ---------- 59 row : a pd.DataFrame from a gtf as a pd.DataFrame containing 60 only one row and columns ['tss', 'strand'] where 'tss' is 61 the transcription start site. 62 len_up : how many base pairs upstream from the transcription 63 start site the promotor range should be adjusted for. 64 len_down : how many base pairs downstream from the 65 transcription start site the promotor range should be 66 adjusted for. 67 68 Returns 69 ------- 70 A 2 element list where the first element is the adjusted start 71 position and the second the the adjusted end position. 72 ''' 73 if row.iloc[1] == '+': 74 start = row.iloc[0] - len_up 75 end = row.iloc[0] + len_down 76 77 elif row.iloc[1] == '-': 78 start = row.iloc[0] - len_down 79 end = row.iloc[0] + len_up 80 81 else: 82 raise ValueError('Strand symbol must be "+" or "-"') 83 84 return [start, end] 85 86 87def _adjust_regions(gene_anno : pd.DataFrame, len_up : int, len_down : int 88 ) -> pd.DataFrame: 89 ''' 90 Takes a GTF file as a pd.DataFrame and adjusts start and end 91 positions to represent promotor regions given user-defined 92 upstream and downstream adjustments relative to the transcription 93 start site. 94 95 Parameters 96 ---------- 97 gene_anno : a pd.DataFrame with columns ['chr', 'start', 'end', 98 'strand', 'gene_name'] created from a GTF file. 99 len_up : an int for how many base pairs upstream of the 100 transcription start site the promotor region should be 101 adjusted for. 102 len_down : an int for how many base pairs downstream of the 103 transcription start site the promotor region should be 104 adjusted for. 105 106 Returns 107 ------- 108 gene_anno as a pd.DataFrame where ['start', 'end'] columns 109 represent the start and end positions of inferred promotor 110 regions for each annotation. 111 ''' 112 # Subsetting DataFrame to only required data 113 required_cols = ['chr', 'start', 'end', 'strand', 'gene_name'] 114 gene_anno = gene_anno[required_cols] 115 116 # Adding a column that annotates transcription start site based on strand 117 tss = gene_anno[['start', 'end', 'strand']].apply(_get_tss, axis = 1) 118 gene_anno.insert(1, column = 'tss', value = tss) 119 120 # Calculating start and end positions based on the tss 121 adj_regions = gene_anno[['tss', 'strand']].apply(lambda row: 122 _calc_range( 123 row = row, 124 len_up = len_up, 125 len_down = len_down), 126 axis = 1, 127 result_type = 'expand' 128 ) 129 130 gene_anno.loc[:, ['start', 'end']] = adj_regions.values 131 132 return gene_anno 133 134 135def _create_region_dicts(gene_anno : pd.DataFrame) -> dict: 136 ''' 137 Takes a GTF as a pd.DataFrame and returns dictionaries required for 138 region comparisons between gene_annotations and assay features. 139 140 Parameters 141 ---------- 142 gene_anno : a pd.DataFrame with columns ['chr', 'start', 'end', 143 'strand', 'gene_name'] created from a GTF file. 144 145 Returns 146 ------- 147 peak_gene_dict : dictionary where keys are regions and 148 values are genes. 149 ga_regions : dictionary where chromosomes are keys and regions are 150 values. 151 ''' 152 peak_gene_dict = {} 153 ga_regions = {} 154 155 for i, anno in gene_anno.iterrows(): 156 157 cur_chr = anno['chr'] 158 cur_start = int(anno['start']) 159 cur_end = int(anno['end']) 160 cur_name = anno['gene_name'] 161 162 peak_gene_dict[(cur_chr, cur_start, cur_end)] = cur_name 163 164 cur_region = np.array([[cur_start, cur_end]], dtype = int) 165 if cur_chr in ga_regions.keys(): 166 ga_regions[cur_chr] = np.concatenate((ga_regions[cur_chr], 167 cur_region), axis = 0) 168 169 else: 170 ga_regions[cur_chr] = cur_region 171 172 return peak_gene_dict, ga_regions 173 174 175def _create_feature_dict(feature_names : list | set | np.ndarray | pd.Series 176 ) -> dict: 177 ''' 178 Takes an array of feature names and returns data formatted for 179 feature comparisons. 180 181 Returns 182 ------- 183 feature_names : an iterable object of region names from single-cell 184 ATAC data matrix. 185 186 Returns 187 ------- 188 A dictionary where keys are chromosomes and values are regions. 189 ''' 190 feature_dict = {} 191 feature_names = [re.split(":|-", peak) for peak in feature_names] 192 for peak_set in feature_names: 193 if peak_set[0] in feature_dict.keys(): 194 feature_dict[peak_set[0]] = np.concatenate( 195 (feature_dict[peak_set[0]], 196 np.array([[peak_set[1], 197 peak_set[2]]], 198 dtype = int)), 199 axis = 0) 200 else: 201 feature_dict[peak_set[0]] = np.array([[peak_set[1], 202 peak_set[2]]], 203 dtype = int) 204 205 return feature_dict 206 207 208def _compare_regions(feature_dict : dict, ga_regions : dict, 209 peak_gene_dict : dict, gene_sets : dict, chr_sep : str 210 ) -> dict: 211 ''' 212 Takes features from a single-cell ATAC data matrix and regions from 213 a gene annotation file to return an ATAC grouping where regions 214 from feature_dict and regions from gene annotations overlap. 215 216 Parameters 217 ---------- 218 feature_dict : a dictionary where keys are chromosomes and 219 values are regions. This data should come from a 220 single-cell atac experiment. 221 ga_regions : a dictionary where keys are chromosomes and values 222 are regions. This data should come from a gene 223 annotations file. 224 peak_gene_dict : a dictionary where keys are peaks from gene 225 annotation file and values are the gene they are 226 associated with. 227 gene_sets : a dictionary where keys are gene set names and values 228 are an iterable object of gene names. 229 chr_sep : The character that separates the chromosome from the 230 rest of the region in the original feature array. 231 232 Returns 233 ------- 234 A dictionary where keys are the names from gene_sets and values 235 are a list of regions from feature_names that overlap with 236 promotor regions respective to genes in gene sets (i.e., if 237 ATAC feature in feature_names overlaps with promotor region 238 from a gene in a gene set from gene_sets, that region will be 239 added to the new dictionary under the respective gene set 240 name). 241 ''' 242 atac_grouping = {group : [] for group in gene_sets.keys()} 243 244 for chrom in feature_dict.keys(): 245 if chrom not in ga_regions.keys(): 246 continue 247 for region in feature_dict[chrom]: 248 for anno in ga_regions[chrom]: 249 if _find_overlap(region[0], region[1], anno[0], anno[1]): 250 gene = peak_gene_dict[(chrom, anno[0], anno[1])] 251 for group in gene_sets.keys(): 252 if gene in gene_sets[group]: 253 feat_region = ''.join(chrom, chr_sep, 254 str(region[0]), "-", 255 str(region[1])) 256 atac_grouping[group].append(feat_region) 257 258 return atac_grouping 259 260 261def get_atac_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 262 feature_names : np.ndarray | pd.Series | list | set, 263 len_up : int = 5000, len_down : int = 5000) -> dict: 264 ''' 265 Creates a peak set where keys are gene set names from `gene_sets` 266 and values are arrays of features pulled from `feature_names`. 267 Features are added to each peak set given overlap between regions 268 in single-cell data matrix and inferred gene promoter regions in 269 `gene_anno`. 270 271 Parameters 272 ---------- 273 **gene_anno** : *pd.DataFrame* 274 > Gene annotations in GTF format as a pd.DataFrame with columns 275 ['chr', 'start', 'end', 'strand', 'gene_name']. 276 277 **gene_sets** : *dict* 278 > Gene set names as keys and an iterable object of gene names 279 as values. 280 281 **feature_names** : *np.ndarray* | *pd.Series* | *list* | *set* 282 > Feature names corresponding to a single_cell ATAC data 283 matrix. 284 285 Returns 286 ------- 287 **atac_grouping** : *dict* 288 > Keys are the names from `gene_sets` and values 289 are a list of regions from `feature_names` that overlap with 290 promotor regions respective to genes in `gene_sets` (i.e., if 291 ATAC feature in `feature_names` overlaps with promotor region 292 from a gene in a gene set from `gene_sets`, that region will be 293 added to the new dictionary under the respective gene set 294 name). 295 296 Examples 297 -------- 298 >>> # Reading in a gene set and the peak names from scATAC dataset 299 >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 300 ... allow_pickle = True) 301 >>> peaks = np.load("data/MCF7_ATAC_feature_names.npy", 302 ... allow_pickle = True) 303 >>> 304 >>> # Reading in GTF file 305 >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf" 306 >>> gtf = pd.read_csv(gtf_path, sep = "\t", header = None, 307 ... skip_blank_lines = True, comment = "#") 308 >>> 309 >>> # Naming columns 310 >>> gtf.columns = ['chr', 'source', 'feature', 'start', 'end', 311 ... 'score', 'strand', 'frame', 'attribute'] 312 >>> 313 >>> # Subsetting to only protein coding genes 314 >>> prot_rows = gtf['attribute'].str.contains('protein_coding') 315 >>> gtf = gtf[prot_rows] 316 >>> gtf = gtf[gtf['feature'] == 'gene'] 317 >>> 318 >>> # Capturing gene name from attributes column 319 >>> gtf['gene_name'] = [re.findall(r'(?<=gene_name ")[A-z0-9]+', 320 ... attr)[0] 321 ... for attr in gtf['attribute']] 322 >>> 323 >>> atac_grouping = scmkl.get_atac_groupings(gene_anno = gtf, 324 ... gene_sets = gene_sets, 325 ... feature_names = peaks) 326 >>> 327 >>> atac_grouping.keys() 328 dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...]) 329 ''' 330 # Getting a unique set of gene names from gene_sets 331 all_genes = {gene for group in gene_sets.keys() 332 for gene in gene_sets[group]} 333 334 # Filtering out NaN values 335 all_genes = [gene for gene in all_genes if type(gene) != float] 336 337 # Filtering out annotations for genes not present in gene_sets 338 gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)] 339 340 # Adjusting start and end columns to represent promotor regions 341 gene_anno = _adjust_regions(gene_anno = gene_anno, 342 len_up = len_up, len_down = len_down) 343 344 # Creating a dictionary from assay features where [chr] : (start, end) 345 feature_dict = _create_feature_dict(feature_names) 346 347 # Creating data structures from gene_anno for comparing regions 348 peak_gene_dict, ga_regions = _create_region_dicts(gene_anno) 349 350 # Capturing the separator type used in assay 351 chr_sep = ':' if ':' in feature_names[0] else '-' 352 353 atac_groupings = _compare_regions(feature_dict = feature_dict, 354 ga_regions = ga_regions, 355 peak_gene_dict = peak_gene_dict, 356 gene_sets = gene_sets, 357 chr_sep = chr_sep) 358 359 return atac_groupings
262def get_atac_groupings(gene_anno : pd.DataFrame, gene_sets : dict, 263 feature_names : np.ndarray | pd.Series | list | set, 264 len_up : int = 5000, len_down : int = 5000) -> dict: 265 ''' 266 Creates a peak set where keys are gene set names from `gene_sets` 267 and values are arrays of features pulled from `feature_names`. 268 Features are added to each peak set given overlap between regions 269 in single-cell data matrix and inferred gene promoter regions in 270 `gene_anno`. 271 272 Parameters 273 ---------- 274 **gene_anno** : *pd.DataFrame* 275 > Gene annotations in GTF format as a pd.DataFrame with columns 276 ['chr', 'start', 'end', 'strand', 'gene_name']. 277 278 **gene_sets** : *dict* 279 > Gene set names as keys and an iterable object of gene names 280 as values. 281 282 **feature_names** : *np.ndarray* | *pd.Series* | *list* | *set* 283 > Feature names corresponding to a single_cell ATAC data 284 matrix. 285 286 Returns 287 ------- 288 **atac_grouping** : *dict* 289 > Keys are the names from `gene_sets` and values 290 are a list of regions from `feature_names` that overlap with 291 promotor regions respective to genes in `gene_sets` (i.e., if 292 ATAC feature in `feature_names` overlaps with promotor region 293 from a gene in a gene set from `gene_sets`, that region will be 294 added to the new dictionary under the respective gene set 295 name). 296 297 Examples 298 -------- 299 >>> # Reading in a gene set and the peak names from scATAC dataset 300 >>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl", 301 ... allow_pickle = True) 302 >>> peaks = np.load("data/MCF7_ATAC_feature_names.npy", 303 ... allow_pickle = True) 304 >>> 305 >>> # Reading in GTF file 306 >>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf" 307 >>> gtf = pd.read_csv(gtf_path, sep = "\t", header = None, 308 ... skip_blank_lines = True, comment = "#") 309 >>> 310 >>> # Naming columns 311 >>> gtf.columns = ['chr', 'source', 'feature', 'start', 'end', 312 ... 'score', 'strand', 'frame', 'attribute'] 313 >>> 314 >>> # Subsetting to only protein coding genes 315 >>> prot_rows = gtf['attribute'].str.contains('protein_coding') 316 >>> gtf = gtf[prot_rows] 317 >>> gtf = gtf[gtf['feature'] == 'gene'] 318 >>> 319 >>> # Capturing gene name from attributes column 320 >>> gtf['gene_name'] = [re.findall(r'(?<=gene_name ")[A-z0-9]+', 321 ... attr)[0] 322 ... for attr in gtf['attribute']] 323 >>> 324 >>> atac_grouping = scmkl.get_atac_groupings(gene_anno = gtf, 325 ... gene_sets = gene_sets, 326 ... feature_names = peaks) 327 >>> 328 >>> atac_grouping.keys() 329 dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...]) 330 ''' 331 # Getting a unique set of gene names from gene_sets 332 all_genes = {gene for group in gene_sets.keys() 333 for gene in gene_sets[group]} 334 335 # Filtering out NaN values 336 all_genes = [gene for gene in all_genes if type(gene) != float] 337 338 # Filtering out annotations for genes not present in gene_sets 339 gene_anno = gene_anno[np.isin(gene_anno['gene_name'], all_genes)] 340 341 # Adjusting start and end columns to represent promotor regions 342 gene_anno = _adjust_regions(gene_anno = gene_anno, 343 len_up = len_up, len_down = len_down) 344 345 # Creating a dictionary from assay features where [chr] : (start, end) 346 feature_dict = _create_feature_dict(feature_names) 347 348 # Creating data structures from gene_anno for comparing regions 349 peak_gene_dict, ga_regions = _create_region_dicts(gene_anno) 350 351 # Capturing the separator type used in assay 352 chr_sep = ':' if ':' in feature_names[0] else '-' 353 354 atac_groupings = _compare_regions(feature_dict = feature_dict, 355 ga_regions = ga_regions, 356 peak_gene_dict = peak_gene_dict, 357 gene_sets = gene_sets, 358 chr_sep = chr_sep) 359 360 return atac_groupings
Creates a peak set where keys are gene set names from gene_sets
and values are arrays of features pulled from feature_names
.
Features are added to each peak set given overlap between regions
in single-cell data matrix and inferred gene promoter regions in
gene_anno
.
Parameters
gene_anno : pd.DataFrame
Gene annotations in GTF format as a pd.DataFrame with columns ['chr', 'start', 'end', 'strand', 'gene_name'].
gene_sets : dict
Gene set names as keys and an iterable object of gene names as values.
feature_names : np.ndarray | pd.Series | list | set
Feature names corresponding to a single_cell ATAC data matrix.
Returns
atac_grouping : dict
Keys are the names from
gene_sets
and values are a list of regions fromfeature_names
that overlap with promotor regions respective to genes ingene_sets
(i.e., if ATAC feature infeature_names
overlaps with promotor region from a gene in a gene set fromgene_sets
, that region will be added to the new dictionary under the respective gene set name).
Examples
>>> # Reading in a gene set and the peak names from scATAC dataset
>>> gene_sets = np.load("data/RNA_hallmark_groupings.pkl",
... allow_pickle = True)
>>> peaks = np.load("data/MCF7_ATAC_feature_names.npy",
... allow_pickle = True)
>>>
>>> # Reading in GTF file
>>> gtf_path = "data/hg38_subset_protein_coding.annotation.gtf"
>>> gtf = pd.read_csv(gtf_path, sep = " ", header = None,
... skip_blank_lines = True, comment = "#")
>>>
>>> # Naming columns
>>> gtf.columns = ['chr', 'source', 'feature', 'start', 'end',
... 'score', 'strand', 'frame', 'attribute']
>>>
>>> # Subsetting to only protein coding genes
>>> prot_rows = gtf['attribute'].str.contains('protein_coding')
>>> gtf = gtf[prot_rows]
>>> gtf = gtf[gtf['feature'] == 'gene']
>>>
>>> # Capturing gene name from attributes column
>>> gtf['gene_name'] = [re.findall(r'(?<=gene_name ")[A-z0-9]+',
... attr)[0]
... for attr in gtf['attribute']]
>>>
>>> atac_grouping = scmkl.get_atac_groupings(gene_anno = gtf,
... gene_sets = gene_sets,
... feature_names = peaks)
>>>
>>> atac_grouping.keys()
dict_keys(['HALLMARK_TNFA_SIGNALING_VIA_NFKB', ...])