scmkl.dataframes
1import os 2import re 3import numpy as np 4import pandas as pd 5 6 7def _parse_result_type(results : dict | None, rfiles : dict | None) -> bool: 8 ''' 9 This function simply returns a bool for whether or not there are 10 multiple runs present while checking that There is one dict and 11 one Nonetype between `results` and `rfiles`. 12 ''' 13 dtypes = (type(results), type(rfiles)) 14 none_in_dtypes = type(None) in dtypes 15 dict_in_dtypes = dict in dtypes 16 both_in_dtypes = none_in_dtypes and dict_in_dtypes 17 18 # Ensuring that at least one of dtypes is None 19 assert both_in_dtypes, "Only `rfiles` or `results` can be provided" 20 21 if type(rfiles) is dict: 22 mult_files = True 23 else: 24 mult_files = False 25 26 return mult_files 27 28 29def _parse_metrics(results, key : str | None = None, 30 include_as = False) -> pd.DataFrame: 31 ''' 32 This function returns a pd.DataFrame for a single scMKL result. 33 ''' 34 alpha_vals = [] 35 met_names = [] 36 met_vals = [] 37 38 # If statement ensuring results is a scMKL results with metrics 39 if 'Metrics' in results.keys(): 40 for alpha in results['Metrics'].keys(): 41 for metric, value in results['Metrics'][alpha].items(): 42 alpha_vals.append(alpha) 43 met_names.append(metric) 44 met_vals.append(value) 45 46 if include_as: 47 assert 'Alpha_star' in results.keys(), "'Alpha_star' not in results" 48 df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 49 50 else: 51 print(f"{key} is not a scMKL result and will be ignored.") 52 53 df = pd.DataFrame({'Alpha' : alpha_vals, 54 'Metric' : met_names, 55 'Value' : met_vals}) 56 57 if key is not None: 58 df['Key'] = [key] * df.shape[0] 59 60 return df 61 62 63def _parse_weights(results : dict, include_as : bool = False, 64 key : None | str = None) -> pd.DataFrame: 65 ''' 66 ''' 67 alpha_vals = [] 68 group_names = [] 69 kernel_weights = [] 70 71 for alpha in results['Norms'].keys(): 72 alpha_vals.extend([alpha] * len(results['Norms'][alpha])) 73 
group_names.extend(results['Group_names']) 74 kernel_weights.extend(results['Norms'][alpha]) 75 76 df = pd.DataFrame({'Alpha' : alpha_vals, 77 'Group' : group_names, 78 'Kernel Weight' : kernel_weights}) 79 80 if include_as: 81 df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 82 83 if key is not None: 84 df['Key'] = [key] * df.shape[0] 85 86 return df 87 88 89def get_summary(results : dict, metric = 'AUROC'): 90 ''' 91 Takes the results from either `scmkl.run()` and generates a 92 dataframe for each model containing columns for alpha, area under 93 the ROC, number of groups with nonzero weights, and highest 94 weighted group. 95 96 Parameters 97 ---------- 98 **results** : *dict* 99 > A dictionary of results from scMKL generated from either 100 `scmkl.run()`. 101 102 **metric** : *str* 103 > Which metric to include in the summary. Default is AUROC. 104 Options include `'AUROC'`, `'Recall'`, `'Precision'`, 105 `'Accuracy'`, and `'F1-Score'`. 106 107 Returns 108 ------- 109 **summary_df** : *pd.DataFrame* 110 > A table with columns: 111 `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`. 112 113 Examples 114 -------- 115 >>> results = scmkl.run(adata, alpha_list) 116 >>> summary_df = scmkl.get_summary(results) 117 ... 
118 >>> summary_df.head() 119 Alpha AUROC Number of Selected Groups 120 0 2.20 0.8600 3 121 1 1.96 0.9123 4 122 2 1.72 0.9357 5 123 3 1.48 0.9524 7 124 4 1.24 0.9666 9 125 Top Group 126 0 RNA-HALLMARK_E2F_TARGETS 127 1 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 128 2 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 129 3 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 130 4 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 131 ''' 132 summary = {'Alpha' : [], 133 'AUROC' : [], 134 'Number of Selected Groups' : [], 135 'Top Group' : []} 136 137 alpha_list = list(results['Metrics'].keys()) 138 139 # Creating summary DataFrame for each model 140 for alpha in alpha_list: 141 cur_alpha_rows = results['Norms'][alpha] 142 top_weight_rows = np.max(results['Norms'][alpha]) 143 top_group_index = np.where(cur_alpha_rows == top_weight_rows) 144 num_selected = len(results['Selected_groups'][alpha]) 145 top_group_names = np.array(results['Group_names'])[top_group_index] 146 147 summary['Alpha'].append(alpha) 148 summary['AUROC'].append(results['Metrics'][alpha][metric]) 149 summary['Number of Selected Groups'].append(num_selected) 150 summary['Top Group'].append(*top_group_names) 151 152 summary = pd.DataFrame(summary) 153 154 return summary 155 156 157def read_files(dir : str, pattern : str | None = None) -> dict: 158 ''' 159 This function takes a directory of scMKL results as pickle files 160 and returns a dictionary with the file names as keys and the data 161 from the respective files as the values. 162 163 Parameters 164 ---------- 165 **dir** : *str* 166 > A string specifying the file path for the output scMKL runs. 167 168 **pattern** : *str* 169 > A regex string for filtering down to desired files. If 170 `None`, all files in the directory with the pickle file 171 extension will be added to the dictionary. 172 173 Returns 174 ------- 175 **results** : *dict* 176 > a dictionary with the file names as keys and data as values. 177 178 Examples 179 -------- 180 >>> filepath = 'scMKL_results/rna+atac/' 181 ... 
182 >>> all_results = scmkl.read_files(filepath) 183 >>> all_results.keys() 184 dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...]) 185 ''' 186 # Reading all pickle files in patter is None 187 if pattern is None: 188 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 189 for file in os.listdir(dir) if '.pkl' in file} 190 191 # Reading only files matching pattern if not None 192 else: 193 pattern = repr(pattern) 194 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 195 for file in os.listdir(dir) 196 if re.fullmatch(pattern, file) is not None} 197 198 return data 199 200 201def get_metrics(results : dict | None = None, rfiles : dict | None = None, 202 include_as : bool = False) -> pd.DataFrame: 203 ''' 204 Takes either a single scMKL result or a dictionary where each 205 entry cooresponds to one result. Returns a dataframe with cols 206 ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 207 col of booleans will be added to indicate whether or not the run 208 respective to that alpha was chosen as optimal via CV. If 209 `include_key == True`, another column will be added with the name 210 of the key to the respective file (only applicable with multiple 211 results). 212 213 Parameters 214 ---------- 215 **results** : *None* | *dict* 216 > A dictionary with the results of a single run from 217 `scmkl.run()`. Must be `None` if `rfiles is not None`. 218 219 **rfiles** : *None* | *dict* 220 > A dictionary of results dictionaries containing multiple 221 results from `scmkl.run()`. If `include_keys == True`, a col 222 will be added to the output pd.DataFrame with the keys as 223 values cooresponding to each row. 224 225 **include_as** : *bool* 226 > When `True`, will add a bool col to output pd.DataFrame 227 where rows with alphas cooresponding to alpha_star will be 228 `True`. 229 230 Returns 231 ------- 232 **df** : *pd.DataFrame* 233 > A pd.DataFrame containing all of the metrics present from 234 the runs input. 
235 236 Examples 237 -------- 238 >>> # For a single file 239 >>> results = scmkl.run(adata) 240 >>> metrics = scmkl.get_metrics(results = results) 241 242 >>> # For multiple runs saved in a dict 243 >>> output_dir = 'scMKL_outputs/' 244 >>> rfiles = scmkl.read_files(output_dir) 245 >>> metrics = scmkl.get_metrics(rfiles) 246 ''' 247 # Checking which data is being worked with 248 multi_results = _parse_result_type(results = results, rfiles = rfiles) 249 250 # Initiating col list with minimal columns 251 cols = ['Alpha', 'Metric', 'Value'] 252 253 if include_as: 254 cols.append('Alpha Star') 255 256 if multi_results: 257 cols.append('Key') 258 df = pd.DataFrame(columns = cols) 259 for key, result in rfiles.items(): 260 cur_df = _parse_metrics(results = result, key = key, 261 include_as = include_as) 262 df = pd.concat([df, cur_df.copy()]) 263 264 else: 265 df = _parse_metrics(results = results, include_as = include_as) 266 267 return df 268 269 270def get_weights(results : dict | None = None, rfiles : dict | None = None, 271 include_as : bool = False) -> pd.DataFrame: 272 ''' 273 Takes either a single scMKL result or dictionary of results and 274 returns a pd.DataFrame with cols ['Alpha', 'Group', 275 'Kernel Weight']. If include_as == True, a fourth col will be 276 added to indicate whether or not the run respective to that alpha 277 was chosen as optimal via CV. 278 279 Parameters 280 ---------- 281 **results** : *None* | *dict* 282 > A dictionary with the results of a single run from 283 `scmkl.run()`. Must be `None` if `rfiles is not None`. 284 285 **rfiles** : *None* | *dict* 286 > A dictionary of results dictionaries containing multiple 287 results from `scmkl.run()`. If `include_keys == True`, a col 288 will be added to the output pd.DataFrame with the keys as 289 values cooresponding to each row. 
290 291 **include_as** : *bool* 292 > When `True`, will add a bool col to output pd.DataFrame 293 where rows with alphas cooresponding to alpha_star will be 294 `True`. 295 296 Returns 297 ------- 298 **df** : *pd.DataFrame* 299 > A pd.DataFrame containing all of the groups from each alpha 300 and their cooresponding kernel weights. 301 302 Examples 303 -------- 304 >>> # For a single file 305 >>> results = scmkl.run(adata) 306 >>> weights = scmkl.get_weights(results = results) 307 >>> 308 >>> # For multiple runs saved in a dict 309 >>> output_dir = 'scMKL_outputs/' 310 >>> rfiles = scmkl.read_files(output_dir) 311 >>> weights = scmkl.get_weights(rfiles) 312 ''' 313 # Checking which data is being worked with 314 multi_results = _parse_result_type(results = results, rfiles = rfiles) 315 316 # Initiating col list with minimal columns 317 cols = ['Alpha', 'Group', 'Kernel Weight'] 318 319 if include_as: 320 cols.append('Alpha Star') 321 322 if multi_results: 323 cols.append('Key') 324 df = pd.DataFrame(columns = cols) 325 for key, result in rfiles.items(): 326 cur_df = _parse_weights(results = result, key = key, 327 include_as = include_as) 328 df = pd.concat([df, cur_df.copy()]) 329 330 else: 331 df = _parse_metrics(results = results, include_as = include_as) 332 333 return df 334 335 336def get_selection(weights_df, order_groups : bool) -> pd.DataFrame: 337 ''' 338 This function takes a pd.DataFrame created by 339 `scmkl.get_weights()` and returns a selection table. Selection 340 refers to how many times a group had a nonzero group weight. To 341 calculate this, a col is added indicating whether the group was 342 selected. Then, the dataframe is grouped by alpha and group. 343 Selection can then be summed returning a dataframe with cols 344 `['Alpha', 'Group', Selection]`. 345 346 Parameters 347 ---------- 348 **weights_df** : *pd.DataFrame* 349 > A dataframe output by `scmkl.get_weights()` with cols 350 `['Alpha', 'Group', 'Kernel Weight']`. 
351 352 **order_groups** : *bool* 353 > If `True`, the `'Group'` col of the output dataframe will be 354 made into a `pd.Categorical` col ordered by number of times 355 each group was selected in decending order. 356 357 Returns 358 ------- 359 **df** : *pd.DataFrame* 360 > A dataframe with cols `['Alpha', 'Group', Selection]`. 361 362 Example 363 ------- 364 >>> # For a single file 365 >>> results = scmkl.run(adata) 366 >>> weights = scmkl.get_weights(results = results) 367 >>> selection = scmkl.get_selection(weights) 368 >>> 369 >>> # For multiple runs saved in a dict 370 >>> output_dir = 'scMKL_outputs/' 371 >>> rfiles = scmkl.read_files(output_dir) 372 >>> weights = scmkl.get_weights(rfiles) 373 >>> selection = scmkl.get_selection(weights) 374 ''' 375 # Adding col indicating whether or not groups have nonzero weight 376 selection = weights_df['Kernel Weight'].apply(lambda x: x > 0) 377 weights_df['Selection'] = selection 378 379 # Summing selection across replications to get selection 380 df = weights_df.groupby(['Alpha', 'Group'])['Selection'].sum() 381 df = df.reset_index() 382 383 # Getting group order 384 if order_groups: 385 order = df.groupby('Group')['Selection'].sum() 386 order = order.reset_index().sort_values(by = 'Selection', 387 ascending = False) 388 order = order['Group'] 389 df['Group'] = pd.Categorical(df['Group'], categories = order) 390 391 392 return df 393 394 395def mean_groups_per_alpha(selection_df) -> dict: 396 ''' 397 This function takes a pd.DataFrame from `scmkl.get_selection()` 398 generated from multiple scMKL results and returns a dictionary 399 with keys being alphas from the input dataframe and values being 400 the mean number of selected groups for a given alpha across 401 results. 402 403 Parameters 404 ---------- 405 **selection_df** : *pd.DataFrame* 406 > A dataframe output by `scmkl.get_selection()` with cols 407 `['Alpha', 'Group', Selection]. 
408 409 Returns 410 ------- 411 **mean_groups** : *dict* 412 > A dictionary with alphas as keys and the mean number of 413 selected groups for that alpha as keys. 414 415 Examples 416 -------- 417 >>> weights = scmkl.get_weights(rfiles) 418 >>> selection = scmkl.get_selection(weights) 419 >>> mean_groups = scmkl.mean_groups_per_alpha(selection) 420 >>> mean_groups = {alpha : np.round(num_selected, 1) 421 ... for alpha, num_selected in mean_groups.items()} 422 >>> 423 >>> print(mean_groups) 424 {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3} 425 ''' 426 mean_groups = {} 427 for alpha in np.unique(selection_df['Alpha']): 428 429 # Capturing rows for given alpha 430 rows = selection_df['Alpha'] == alpha 431 432 # Adding mean number of groups for alpha 433 mean_groups[alpha] = np.mean(selection_df[rows]['Selection']) 434 435 return mean_groups
def get_summary(results : dict, metric = 'AUROC'):
    '''
    Takes the results from `scmkl.run()` and generates a dataframe 
    with one row per model containing columns for alpha, the chosen 
    metric, number of selected groups, and highest weighted group.

    Parameters
    ----------
    **results** : *dict*
        > A dictionary of results from scMKL generated by 
        `scmkl.run()`. The entries `'Metrics'`, `'Norms'`, 
        `'Selected_groups'`, and `'Group_names'` are read.

    **metric** : *str*
        > Which metric to include in the summary. Default is AUROC. 
        Options include `'AUROC'`, `'Recall'`, `'Precision'`, 
        `'Accuracy'`, and `'F1-Score'`. The values are stored in the 
        col labelled `'AUROC'` regardless of the metric chosen.

    Returns
    -------
    **summary_df** : *pd.DataFrame*
        > A table with columns:
        `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`.
        When several groups tie for the largest weight, the first of 
        them in `results['Group_names']` order is reported. When the 
        tie is at a weight that is not positive (no group carries 
        weight), `'Top Group'` is left missing.

    Examples
    --------
    >>> results = scmkl.run(adata, alpha_list)
    >>> summary_df = scmkl.get_summary(results)
    ...
    >>> summary_df.head()
        Alpha   AUROC  Number of Selected Groups 
    0   2.20  0.8600                          3   
    1   1.96  0.9123                          4   
    2   1.72  0.9357                          5   
    3   1.48  0.9524                          7   
    4   1.24  0.9666                          9   
                                Top Group
    0                RNA-HALLMARK_E2F_TARGETS
    1                RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    2                RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    3                RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    4                RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    '''
    summary = {'Alpha' : [],
               'AUROC' : [],
               'Number of Selected Groups' : [],
               'Top Group' : []}

    group_names = np.array(results['Group_names'])

    # Creating one summary row for each model
    for alpha, metrics in results['Metrics'].items():
        weights = np.asarray(results['Norms'][alpha])
        top_weight = np.max(weights)
        tied = np.flatnonzero(weights == top_weight)

        # Several groups can share the largest weight (always the case 
        # when every weight is zero), so exactly one value is chosen 
        # here instead of unpacking all of them into `append`
        no_top_group = tied.size == 0 or (tied.size > 1 
                                          and not top_weight > 0)
        top_group = None if no_top_group else group_names[tied[0]]

        summary['Alpha'].append(alpha)
        summary['AUROC'].append(metrics[metric])
        summary['Number of Selected Groups'].append(
            len(results['Selected_groups'][alpha]))
        summary['Top Group'].append(top_group)

    summary = pd.DataFrame(summary)

    return summary
Takes the results from scmkl.run
and generates a
dataframe for each model containing columns for alpha, area under
the ROC, number of groups with nonzero weights, and highest
weighted group.
Parameters
results : dict
A dictionary of results from scMKL generated by
scmkl.run
.
metric : str
Which metric to include in the summary. Default is AUROC. Options include
'AUROC'
,'Recall'
,'Precision'
,'Accuracy'
, and 'F1-Score'
.
Returns
summary_df : pd.DataFrame
A table with columns:
['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']
.
Examples
>>> results = scmkl.run(adata, alpha_list)
>>> summary_df = scmkl.get_summary(results)
...
>>> summary_df.head()
Alpha AUROC Number of Selected Groups
0 2.20 0.8600 3
1 1.96 0.9123 4
2 1.72 0.9357 5
3 1.48 0.9524 7
4 1.24 0.9666 9
Top Group
0 RNA-HALLMARK_E2F_TARGETS
1 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
2 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
3 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
4 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
158def read_files(dir : str, pattern : str | None = None) -> dict: 159 ''' 160 This function takes a directory of scMKL results as pickle files 161 and returns a dictionary with the file names as keys and the data 162 from the respective files as the values. 163 164 Parameters 165 ---------- 166 **dir** : *str* 167 > A string specifying the file path for the output scMKL runs. 168 169 **pattern** : *str* 170 > A regex string for filtering down to desired files. If 171 `None`, all files in the directory with the pickle file 172 extension will be added to the dictionary. 173 174 Returns 175 ------- 176 **results** : *dict* 177 > a dictionary with the file names as keys and data as values. 178 179 Examples 180 -------- 181 >>> filepath = 'scMKL_results/rna+atac/' 182 ... 183 >>> all_results = scmkl.read_files(filepath) 184 >>> all_results.keys() 185 dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...]) 186 ''' 187 # Reading all pickle files in patter is None 188 if pattern is None: 189 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 190 for file in os.listdir(dir) if '.pkl' in file} 191 192 # Reading only files matching pattern if not None 193 else: 194 pattern = repr(pattern) 195 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 196 for file in os.listdir(dir) 197 if re.fullmatch(pattern, file) is not None} 198 199 return data
This function takes a directory of scMKL results as pickle files and returns a dictionary with the file names as keys and the data from the respective files as the values.
Parameters
dir : str
A string specifying the file path for the output scMKL runs.
pattern : str
A regex string for filtering down to desired files. If
None
, all files in the directory with the pickle file extension will be added to the dictionary.
Returns
results : dict
a dictionary with the file names as keys and data as values.
Examples
>>> filepath = 'scMKL_results/rna+atac/'
...
>>> all_results = scmkl.read_files(filepath)
>>> all_results.keys()
dict_keys(['Rep_1.pkl', 'Rep_2.pkl', 'Rep_3.pkl', ...])
def get_metrics(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or a dictionary where each 
    entry corresponds to one result. Returns a dataframe with cols 
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 
    col of booleans will be added to indicate whether or not the run 
    respective to that alpha was chosen as optimal via CV. When 
    `rfiles` is provided, a `'Key'` col is added with the key of the 
    result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be passed by keyword (or 
        with `results = None`).

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the metrics present from 
        the runs input.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results = results)

    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles = rfiles)
    '''
    # Checking which data is being worked with
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        return _parse_metrics(results = results, include_as = include_as)

    # Columns of the combined dataframe, in output order
    cols = ['Alpha', 'Metric', 'Value']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_metrics(results = result, key = key, 
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames (ignored entries) are dropped so one concat can be 
    # done without pandas' empty-entry dtype deprecation warning
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    return pd.concat(frames).reindex(columns = cols)
Takes either a single scMKL result or a dictionary where each
entry corresponds to one result. Returns a dataframe with cols
['Alpha', 'Metric', 'Value']. If include_as == True
, another
col of booleans will be added to indicate whether or not the run
respective to that alpha was chosen as optimal via CV. If
rfiles
is provided, a 'Key' column will be added with the name
of the key to the respective file (only applicable with multiple
results).
Parameters
results : None | dict
A dictionary with the results of a single run from
scmkl.run
. Must be None
if rfiles is not None
.
rfiles : None | dict
A dictionary of results dictionaries containing multiple results from
scmkl.run
. When provided,
a 'Key' col will be added to the output pd.DataFrame with the keys as values corresponding to each row.
include_as : bool
When
True
, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True
.
Returns
df : pd.DataFrame
A pd.DataFrame containing all of the metrics present from the runs input.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> metrics = scmkl.get_metrics(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> metrics = scmkl.get_metrics(rfiles = rfiles)
def get_weights(results : dict | None = None, rfiles : dict | None = None, 
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or dictionary of results and 
    returns a pd.DataFrame with cols ['Alpha', 'Group', 
    'Kernel Weight']. If include_as == True, another col will be 
    added to indicate whether or not the run respective to that alpha 
    was chosen as optimal via CV. When `rfiles` is provided, a `'Key'` 
    col is added with the key of the result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from 
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple 
        results from `scmkl.run()`. Must be passed by keyword (or 
        with `results = None`).

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame 
        where rows with alphas corresponding to alpha_star will be 
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the groups from each alpha 
        and their corresponding kernel weights.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> 
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    '''
    # Checking which data is being worked with
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        # A single result is parsed for weights, not metrics
        return _parse_weights(results = results, include_as = include_as)

    # Columns of the combined dataframe, in output order
    cols = ['Alpha', 'Group', 'Kernel Weight']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_weights(results = result, key = key, 
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames are dropped so one concat can be done without 
    # pandas' empty-entry dtype deprecation warning
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    return pd.concat(frames).reindex(columns = cols)
Takes either a single scMKL result or dictionary of results and returns a pd.DataFrame with cols ['Alpha', 'Group', 'Kernel Weight']. If include_as == True, a fourth col will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV.
Parameters
results : None | dict
A dictionary with the results of a single run from
scmkl.run
. Must be None
if rfiles is not None
.
rfiles : None | dict
A dictionary of results dictionaries containing multiple results from
scmkl.run
. When provided,
a 'Key' col will be added to the output pd.DataFrame with the keys as values corresponding to each row.
include_as : bool
When
True
, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True
.
Returns
df : pd.DataFrame
A pd.DataFrame containing all of the groups from each alpha and their corresponding kernel weights.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles = rfiles)
def get_selection(weights_df, order_groups : bool = False) -> pd.DataFrame:
    '''
    This function takes a pd.DataFrame created by 
    `scmkl.get_weights()` and returns a selection table. Selection 
    refers to how many times a group had a nonzero group weight. To 
    calculate this, rows are flagged by whether the kernel weight is 
    greater than zero, grouped by alpha and group, and the flags are 
    summed. The input dataframe is not modified.

    Parameters
    ----------
    **weights_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`.

    **order_groups** : *bool*
        > If `True`, the `'Group'` col of the output dataframe will be 
        made into a `pd.Categorical` col whose categories are listed 
        by number of times each group was selected in descending 
        order. Default is `False`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A dataframe with cols `['Alpha', 'Group', 'Selection']`.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> selection = scmkl.get_selection(weights)
    >>> 
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    '''
    # Flagging groups with nonzero weight on a copy so the caller's 
    # dataframe is left untouched
    flagged = weights_df.assign(Selection = weights_df['Kernel Weight'] > 0)

    # Summing selection across replications to get selection
    df = flagged.groupby(['Alpha', 'Group'])['Selection'].sum()
    df = df.reset_index()

    # Getting group order
    if order_groups:
        order = df.groupby('Group')['Selection'].sum()
        order = order.reset_index().sort_values(by = 'Selection', 
                                                ascending = False)
        order = order['Group']
        df['Group'] = pd.Categorical(df['Group'], categories = order)

    return df
This function takes a pd.DataFrame created by
scmkl.get_weights()
and returns a selection table. Selection
refers to how many times a group had a nonzero group weight. To
calculate this, a col is added indicating whether the group was
selected. Then, the dataframe is grouped by alpha and group.
Selection can then be summed returning a dataframe with cols
['Alpha', 'Group', 'Selection']
.
Parameters
weights_df : pd.DataFrame
A dataframe output by
scmkl.get_weights()
with cols ['Alpha', 'Group', 'Kernel Weight']
.
order_groups : bool
If
True
, the 'Group'
col of the output dataframe will be made into a pd.Categorical
col ordered by number of times each group was selected in descending order.
Returns
df : pd.DataFrame
A dataframe with cols
['Alpha', 'Group', 'Selection']
.
Example
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> selection = scmkl.get_selection(weights, order_groups = False)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights, order_groups = False)
def mean_groups_per_alpha(selection_df) -> dict:
    '''
    This function takes a pd.DataFrame from `scmkl.get_selection()` 
    and returns a dictionary with keys being alphas from the input 
    dataframe and values being the mean of the `'Selection'` col over 
    the rows belonging to that alpha.

    Parameters
    ----------
    **selection_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_selection()` with cols 
        `['Alpha', 'Group', 'Selection']`.

    Returns
    -------
    **mean_groups** : *dict*
        > A dictionary with alphas as keys and the mean `'Selection'` 
        value for that alpha as values.

    Notes
    -----
    NOTE(review): the mean is taken over the rows (groups) of each 
    alpha, not over runs; confirm this is the intended denominator 
    for a "mean number of selected groups".

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights, order_groups = False)
    >>> mean_groups = scmkl.mean_groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    '''
    alphas = selection_df['Alpha']

    # One entry per distinct alpha, averaging the rows of that alpha
    mean_groups = {
        alpha : np.mean(selection_df[alphas == alpha]['Selection'])
        for alpha in np.unique(alphas)
    }

    return mean_groups
This function takes a pd.DataFrame from scmkl.get_selection()
generated from multiple scMKL results and returns a dictionary
with keys being alphas from the input dataframe and values being
the mean number of selected groups for a given alpha across
results.
Parameters
selection_df : pd.DataFrame
A dataframe output by
scmkl.get_selection()
with cols ['Alpha', 'Group', 'Selection'].
Returns
mean_groups : dict
A dictionary with alphas as keys and the mean number of selected groups for that alpha as values.
Examples
>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights, order_groups = False)
>>> mean_groups = scmkl.mean_groups_per_alpha(selection)
>>> mean_groups = {alpha : np.round(num_selected, 1)
... for alpha, num_selected in mean_groups.items()}
>>>
>>> print(mean_groups)
{0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}