scmkl.dataframes
1import os 2import re 3import numpy as np 4import pandas as pd 5 6 7def _parse_result_type(results : dict | None, rfiles : dict | None) -> bool: 8 ''' 9 This function simply returns a bool for whether or not there are 10 multiple runs present while checking that There is one dict and 11 one Nonetype between `results` and `rfiles`. 12 ''' 13 dtypes = (type(results), type(rfiles)) 14 none_in_dtypes = type(None) in dtypes 15 dict_in_dtypes = dict in dtypes 16 both_in_dtypes = none_in_dtypes and dict_in_dtypes 17 18 # Ensuring that at least one of dtypes is None 19 assert both_in_dtypes, "Only `rfiles` or `results` can be provided" 20 21 if type(rfiles) is dict: 22 mult_files = True 23 else: 24 mult_files = False 25 26 return mult_files 27 28 29def _parse_metrics(results, key : str | None = None, 30 include_as = False) -> pd.DataFrame: 31 ''' 32 This function returns a pd.DataFrame for a single scMKL result. 33 ''' 34 alpha_vals = [] 35 met_names = [] 36 met_vals = [] 37 38 # If statement ensuring results is a scMKL results with metrics 39 if 'Metrics' in results.keys(): 40 for alpha in results['Metrics'].keys(): 41 for metric, value in results['Metrics'][alpha].items(): 42 alpha_vals.append(alpha) 43 met_names.append(metric) 44 met_vals.append(value) 45 46 # Fix this for include_as parameter 47 else: 48 print(f"{key} is not a scMKL result and will be ignored.") 49 50 df = pd.DataFrame({'Alpha' : alpha_vals, 51 'Metric' : met_names, 52 'Value' : met_vals}) 53 54 if include_as: 55 assert 'Alpha_star' in results.keys(), "'Alpha_star' not in results" 56 df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 57 58 if key is not None: 59 df['Key'] = [key] * df.shape[0] 60 61 return df 62 63 64def _parse_weights(results : dict, include_as : bool = False, 65 key : None | str = None) -> pd.DataFrame: 66 ''' 67 ''' 68 alpha_vals = [] 69 group_names = [] 70 kernel_weights = [] 71 72 for alpha in results['Norms'].keys(): 73 alpha_vals.extend([alpha] * 
len(results['Norms'][alpha])) 74 group_names.extend(results['Group_names']) 75 kernel_weights.extend(results['Norms'][alpha]) 76 77 df = pd.DataFrame({'Alpha' : alpha_vals, 78 'Group' : group_names, 79 'Kernel Weight' : kernel_weights}) 80 81 if include_as: 82 df['Alpha Star'] = df['Alpha'] == results['Alpha_star'] 83 84 if key is not None: 85 df['Key'] = [key] * df.shape[0] 86 87 return df 88 89 90def get_summary(results : dict, metric = 'AUROC'): 91 ''' 92 Takes the results from either `scmkl.run()` and generates a 93 dataframe for each model containing columns for alpha, area under 94 the ROC, number of groups with nonzero weights, and highest 95 weighted group. 96 97 Parameters 98 ---------- 99 **results** : *dict* 100 > A dictionary of results from scMKL generated from either 101 `scmkl.run()`. 102 103 **metric** : *str* 104 > Which metric to include in the summary. Default is AUROC. 105 Options include `'AUROC'`, `'Recall'`, `'Precision'`, 106 `'Accuracy'`, and `'F1-Score'`. 107 108 Returns 109 ------- 110 **summary_df** : *pd.DataFrame* 111 > A table with columns: 112 `['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']`. 113 114 Examples 115 -------- 116 >>> results = scmkl.run(adata, alpha_list) 117 >>> summary_df = scmkl.get_summary(results) 118 ... 
119 >>> summary_df.head() 120 Alpha AUROC Number of Selected Groups 121 0 2.20 0.8600 3 122 1 1.96 0.9123 4 123 2 1.72 0.9357 5 124 3 1.48 0.9524 7 125 4 1.24 0.9666 9 126 Top Group 127 0 RNA-HALLMARK_E2F_TARGETS 128 1 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 129 2 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 130 3 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 131 4 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY 132 ''' 133 summary = {'Alpha' : [], 134 'AUROC' : [], 135 'Number of Selected Groups' : [], 136 'Top Group' : []} 137 138 alpha_list = list(results['Metrics'].keys()) 139 140 # Creating summary DataFrame for each model 141 for alpha in alpha_list: 142 cur_alpha_rows = results['Norms'][alpha] 143 top_weight_rows = np.max(results['Norms'][alpha]) 144 top_group_index = np.where(cur_alpha_rows == top_weight_rows) 145 num_selected = len(results['Selected_groups'][alpha]) 146 top_group_names = np.array(results['Group_names'])[top_group_index] 147 148 summary['Alpha'].append(alpha) 149 summary[metric].append(results['Metrics'][alpha][metric]) 150 summary['Number of Selected Groups'].append(num_selected) 151 summary['Top Group'].append(*top_group_names) 152 153 summary = pd.DataFrame(summary) 154 155 return summary 156 157 158def read_files(dir : str, pattern : str | None = None) -> dict: 159 ''' 160 This function takes a directory of scMKL results as pickle files 161 and returns a dictionary with the file names as keys and the data 162 from the respective files as the values. 163 164 Parameters 165 ---------- 166 **dir** : *str* 167 > A string specifying the file path for the output scMKL runs. 168 169 **pattern** : *str* 170 > A regex string for filtering down to desired files. If 171 `None`, all files in the directory with the pickle file 172 extension will be added to the dictionary. 173 174 Returns 175 ------- 176 **results** : *dict* 177 > a dictionary with the file names as keys and data as values. 178 179 Examples 180 -------- 181 >>> filepath = 'scMKL_results/rna+atac/' 182 ... 
183 >>> all_results = scmkl.read_files(filepath) 184 >>> all_results.keys() 185 dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...]) 186 ''' 187 # Reading all pickle files in patter is None 188 if pattern is None: 189 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 190 for file in os.listdir(dir) if '.pkl' in file} 191 192 # Reading only files matching pattern if not None 193 else: 194 pattern = repr(pattern) 195 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 196 for file in os.listdir(dir) 197 if re.fullmatch(pattern, file) is not None} 198 199 return data 200 201 202def get_metrics(results : dict | None = None, rfiles : dict | None = None, 203 include_as : bool = False) -> pd.DataFrame: 204 ''' 205 Takes either a single scMKL result or a dictionary where each 206 entry cooresponds to one result. Returns a dataframe with cols 207 ['Alpha', 'Metric', 'Value']. If `include_as == True`, another 208 col of booleans will be added to indicate whether or not the run 209 respective to that alpha was chosen as optimal via CV. If 210 `include_key == True`, another column will be added with the name 211 of the key to the respective file (only applicable with multiple 212 results). 213 214 Parameters 215 ---------- 216 **results** : *None* | *dict* 217 > A dictionary with the results of a single run from 218 `scmkl.run()`. Must be `None` if `rfiles is not None`. 219 220 **rfiles** : *None* | *dict* 221 > A dictionary of results dictionaries containing multiple 222 results from `scmkl.run()`. If `include_keys == True`, a col 223 will be added to the output pd.DataFrame with the keys as 224 values cooresponding to each row. 225 226 **include_as** : *bool* 227 > When `True`, will add a bool col to output pd.DataFrame 228 where rows with alphas cooresponding to alpha_star will be 229 `True`. 230 231 Returns 232 ------- 233 **df** : *pd.DataFrame* 234 > A pd.DataFrame containing all of the metrics present from 235 the runs input. 
236 237 Examples 238 -------- 239 >>> # For a single file 240 >>> results = scmkl.run(adata) 241 >>> metrics = scmkl.get_metrics(results = results) 242 243 >>> # For multiple runs saved in a dict 244 >>> output_dir = 'scMKL_outputs/' 245 >>> rfiles = scmkl.read_files(output_dir) 246 >>> metrics = scmkl.get_metrics(rfiles) 247 ''' 248 # Checking which data is being worked with 249 multi_results = _parse_result_type(results = results, rfiles = rfiles) 250 251 # Initiating col list with minimal columns 252 cols = ['Alpha', 'Metric', 'Value'] 253 254 if include_as: 255 cols.append('Alpha Star') 256 257 if multi_results: 258 cols.append('Key') 259 df = pd.DataFrame(columns = cols) 260 for key, result in rfiles.items(): 261 cur_df = _parse_metrics(results = result, key = key, 262 include_as = include_as) 263 df = pd.concat([df, cur_df.copy()]) 264 265 else: 266 df = _parse_metrics(results = results, include_as = include_as) 267 268 return df 269 270 271def get_weights(results : dict | None = None, rfiles : dict | None = None, 272 include_as : bool = False) -> pd.DataFrame: 273 ''' 274 Takes either a single scMKL result or dictionary of results and 275 returns a pd.DataFrame with cols ['Alpha', 'Group', 276 'Kernel Weight']. If include_as == True, a fourth col will be 277 added to indicate whether or not the run respective to that alpha 278 was chosen as optimal via CV. 279 280 Parameters 281 ---------- 282 **results** : *None* | *dict* 283 > A dictionary with the results of a single run from 284 `scmkl.run()`. Must be `None` if `rfiles is not None`. 285 286 **rfiles** : *None* | *dict* 287 > A dictionary of results dictionaries containing multiple 288 results from `scmkl.run()`. If `include_keys == True`, a col 289 will be added to the output pd.DataFrame with the keys as 290 values cooresponding to each row. 
291 292 **include_as** : *bool* 293 > When `True`, will add a bool col to output pd.DataFrame 294 where rows with alphas cooresponding to alpha_star will be 295 `True`. 296 297 Returns 298 ------- 299 **df** : *pd.DataFrame* 300 > A pd.DataFrame containing all of the groups from each alpha 301 and their cooresponding kernel weights. 302 303 Examples 304 -------- 305 >>> # For a single file 306 >>> results = scmkl.run(adata) 307 >>> weights = scmkl.get_weights(results = results) 308 >>> 309 >>> # For multiple runs saved in a dict 310 >>> output_dir = 'scMKL_outputs/' 311 >>> rfiles = scmkl.read_files(output_dir) 312 >>> weights = scmkl.get_weights(rfiles) 313 ''' 314 # Checking which data is being worked with 315 multi_results = _parse_result_type(results = results, rfiles = rfiles) 316 317 # Initiating col list with minimal columns 318 cols = ['Alpha', 'Group', 'Kernel Weight'] 319 320 if include_as: 321 cols.append('Alpha Star') 322 323 if multi_results: 324 cols.append('Key') 325 df = pd.DataFrame(columns = cols) 326 for key, result in rfiles.items(): 327 cur_df = _parse_weights(results = result, key = key, 328 include_as = include_as) 329 df = pd.concat([df, cur_df.copy()]) 330 331 else: 332 df = _parse_metrics(results = results, include_as = include_as) 333 334 return df 335 336 337def get_selection(weights_df, order_groups : bool) -> pd.DataFrame: 338 ''' 339 This function takes a pd.DataFrame created by 340 `scmkl.get_weights()` and returns a selection table. Selection 341 refers to how many times a group had a nonzero group weight. To 342 calculate this, a col is added indicating whether the group was 343 selected. Then, the dataframe is grouped by alpha and group. 344 Selection can then be summed returning a dataframe with cols 345 `['Alpha', 'Group', Selection]`. 346 347 Parameters 348 ---------- 349 **weights_df** : *pd.DataFrame* 350 > A dataframe output by `scmkl.get_weights()` with cols 351 `['Alpha', 'Group', 'Kernel Weight']`. 
352 353 **order_groups** : *bool* 354 > If `True`, the `'Group'` col of the output dataframe will be 355 made into a `pd.Categorical` col ordered by number of times 356 each group was selected in decending order. 357 358 Returns 359 ------- 360 **df** : *pd.DataFrame* 361 > A dataframe with cols `['Alpha', 'Group', Selection]`. 362 363 Example 364 ------- 365 >>> # For a single file 366 >>> results = scmkl.run(adata) 367 >>> weights = scmkl.get_weights(results = results) 368 >>> selection = scmkl.get_selection(weights) 369 >>> 370 >>> # For multiple runs saved in a dict 371 >>> output_dir = 'scMKL_outputs/' 372 >>> rfiles = scmkl.read_files(output_dir) 373 >>> weights = scmkl.get_weights(rfiles) 374 >>> selection = scmkl.get_selection(weights) 375 ''' 376 # Adding col indicating whether or not groups have nonzero weight 377 selection = weights_df['Kernel Weight'].apply(lambda x: x > 0) 378 weights_df['Selection'] = selection 379 380 # Summing selection across replications to get selection 381 df = weights_df.groupby(['Alpha', 'Group'])['Selection'].sum() 382 df = df.reset_index() 383 384 # Getting group order 385 if order_groups: 386 order = df.groupby('Group')['Selection'].sum() 387 order = order.reset_index().sort_values(by = 'Selection', 388 ascending = False) 389 order = order['Group'] 390 df['Group'] = pd.Categorical(df['Group'], categories = order) 391 392 393 return df 394 395 396def mean_groups_per_alpha(selection_df) -> dict: 397 ''' 398 This function takes a pd.DataFrame from `scmkl.get_selection()` 399 generated from multiple scMKL results and returns a dictionary 400 with keys being alphas from the input dataframe and values being 401 the mean number of selected groups for a given alpha across 402 results. 403 404 Parameters 405 ---------- 406 **selection_df** : *pd.DataFrame* 407 > A dataframe output by `scmkl.get_selection()` with cols 408 `['Alpha', 'Group', Selection]. 
409 410 Returns 411 ------- 412 **mean_groups** : *dict* 413 > A dictionary with alphas as keys and the mean number of 414 selected groups for that alpha as keys. 415 416 Examples 417 -------- 418 >>> weights = scmkl.get_weights(rfiles) 419 >>> selection = scmkl.get_selection(weights) 420 >>> mean_groups = scmkl.mean_groups_per_alpha(selection) 421 >>> mean_groups = {alpha : np.round(num_selected, 1) 422 ... for alpha, num_selected in mean_groups.items()} 423 >>> 424 >>> print(mean_groups) 425 {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3} 426 ''' 427 mean_groups = {} 428 for alpha in np.unique(selection_df['Alpha']): 429 430 # Capturing rows for given alpha 431 rows = selection_df['Alpha'] == alpha 432 433 # Adding mean number of groups for alpha 434 mean_groups[alpha] = np.mean(selection_df[rows]['Selection']) 435 436 return mean_groups
def get_summary(results : dict, metric = 'AUROC'):
    '''
    Takes the results from `scmkl.run()` and generates a dataframe
    with one row per model (alpha) containing the alpha, the chosen
    metric, the number of groups with nonzero weights, and the
    highest weighted group.

    Parameters
    ----------
    **results** : *dict*
        > A dictionary of results from scMKL generated from
        `scmkl.run()`.

    **metric** : *str*
        > Which metric to include in the summary. Default is AUROC.
        Options include `'AUROC'`, `'Recall'`, `'Precision'`,
        `'Accuracy'`, and `'F1-Score'`.

    Returns
    -------
    **summary_df** : *pd.DataFrame*
        > A table with columns:
        `['Alpha', metric, 'Number of Selected Groups', 'Top Group']`.
        When several groups share the highest weight, the first of
        them is reported as the top group.

    Examples
    --------
    >>> results = scmkl.run(adata, alpha_list)
    >>> summary_df = scmkl.get_summary(results)
    ...
    >>> summary_df.head()
        Alpha   AUROC  Number of Selected Groups
    0   2.20  0.8600                          3
    1   1.96  0.9123                          4
    2   1.72  0.9357                          5
    3   1.48  0.9524                          7
    4   1.24  0.9666                          9
                                    Top Group
    0                RNA-HALLMARK_E2F_TARGETS
    1    RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    2    RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    3    RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    4    RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
    '''
    # The metric col is named after the requested metric so that
    # metrics other than AUROC can be summarized
    summary = {'Alpha' : [],
               metric : [],
               'Number of Selected Groups' : [],
               'Top Group' : []}

    group_names = np.asarray(results['Group_names'])

    # Creating one summary row for each model
    for alpha, alpha_metrics in results['Metrics'].items():
        # argmax returns the first maximum, so ties resolve to one name
        top_index = int(np.argmax(np.asarray(results['Norms'][alpha])))
        num_selected = len(results['Selected_groups'][alpha])

        summary['Alpha'].append(alpha)
        summary[metric].append(alpha_metrics[metric])
        summary['Number of Selected Groups'].append(num_selected)
        summary['Top Group'].append(group_names[top_index])

    return pd.DataFrame(summary)
Takes the results from scmkl.run
and generates a
dataframe for each model containing columns for alpha, area under
the ROC, number of groups with nonzero weights, and highest
weighted group.
Parameters
results : dict
A dictionary of results from scMKL generated from
scmkl.run
.
metric : str
Which metric to include in the summary. Default is AUROC. Options include
'AUROC'
, 'Recall'
, 'Precision'
, 'Accuracy'
, and 'F1-Score'
.
Returns
summary_df : pd.DataFrame
A table with columns:
['Alpha', 'AUROC', 'Number of Selected Groups', 'Top Group']
.
Examples
>>> results = scmkl.run(adata, alpha_list)
>>> summary_df = scmkl.get_summary(results)
...
>>> summary_df.head()
Alpha AUROC Number of Selected Groups
0 2.20 0.8600 3
1 1.96 0.9123 4
2 1.72 0.9357 5
3 1.48 0.9524 7
4 1.24 0.9666 9
Top Group
0 RNA-HALLMARK_E2F_TARGETS
1 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
2 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
3 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
4 RNA-HALLMARK_ESTROGEN_RESPONSE_EARLY
159def read_files(dir : str, pattern : str | None = None) -> dict: 160 ''' 161 This function takes a directory of scMKL results as pickle files 162 and returns a dictionary with the file names as keys and the data 163 from the respective files as the values. 164 165 Parameters 166 ---------- 167 **dir** : *str* 168 > A string specifying the file path for the output scMKL runs. 169 170 **pattern** : *str* 171 > A regex string for filtering down to desired files. If 172 `None`, all files in the directory with the pickle file 173 extension will be added to the dictionary. 174 175 Returns 176 ------- 177 **results** : *dict* 178 > a dictionary with the file names as keys and data as values. 179 180 Examples 181 -------- 182 >>> filepath = 'scMKL_results/rna+atac/' 183 ... 184 >>> all_results = scmkl.read_files(filepath) 185 >>> all_results.keys() 186 dict_keys(['Rep_1.pkl', Rep_2.pkl, Rep_3.pkl, ...]) 187 ''' 188 # Reading all pickle files in patter is None 189 if pattern is None: 190 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 191 for file in os.listdir(dir) if '.pkl' in file} 192 193 # Reading only files matching pattern if not None 194 else: 195 pattern = repr(pattern) 196 data = {file : np.load(f'{dir}/{file}', allow_pickle = True) 197 for file in os.listdir(dir) 198 if re.fullmatch(pattern, file) is not None} 199 200 return data
This function takes a directory of scMKL results as pickle files and returns a dictionary with the file names as keys and the data from the respective files as the values.
Parameters
dir : str
A string specifying the file path for the output scMKL runs.
pattern : str
A regex string for filtering down to desired files. If
None
, all files in the directory with the pickle file extension will be added to the dictionary.
Returns
results : dict
a dictionary with the file names as keys and data as values.
Examples
>>> filepath = 'scMKL_results/rna+atac/'
...
>>> all_results = scmkl.read_files(filepath)
>>> all_results.keys()
dict_keys(['Rep_1.pkl', 'Rep_2.pkl', 'Rep_3.pkl', ...])
def get_metrics(results : dict | None = None, rfiles : dict | None = None,
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or a dictionary where each
    entry corresponds to one result. Returns a dataframe with cols
    ['Alpha', 'Metric', 'Value']. If `include_as == True`, another
    col of booleans will be added to indicate whether or not the run
    respective to that alpha was chosen as optimal via CV. When
    `rfiles` is provided, a `'Key'` col is added with the key of the
    result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple
        results from `scmkl.run()`. Must be passed by keyword.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame
        where rows with alphas corresponding to alpha_star will be
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the metrics present from
        the runs input.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> metrics = scmkl.get_metrics(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> metrics = scmkl.get_metrics(rfiles = rfiles)
    '''
    # Checking which data is being worked with
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    if not multi_results:
        return _parse_metrics(results = results, include_as = include_as)

    cols = ['Alpha', 'Metric', 'Value']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_metrics(results = result, key = key,
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames (ignored entries) would only degrade col dtypes
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    # Concatenating once instead of once per result
    return pd.concat(frames)
Takes either a single scMKL result or a dictionary where each
entry corresponds to one result. Returns a dataframe with cols
['Alpha', 'Metric', 'Value']. If include_as == True
, another
col of booleans will be added to indicate whether or not the run
respective to that alpha was chosen as optimal via CV. If
rfiles
is provided, another column will be added with the name
of the key to the respective file (only applicable with multiple
results).
Parameters
results : None | dict
A dictionary with the results of a single run from
scmkl.run
. Must be None
if rfiles is not None
.
rfiles : None | dict
A dictionary of results dictionaries containing multiple results from
scmkl.run
. A col will be added to the output pd.DataFrame with the keys as values corresponding to each row.
include_as : bool
When
True
, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True
.
Returns
df : pd.DataFrame
A pd.DataFrame containing all of the metrics present from the runs input.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> metrics = scmkl.get_metrics(results = results)
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> metrics = scmkl.get_metrics(rfiles = rfiles)
def get_weights(results : dict | None = None, rfiles : dict | None = None,
                include_as : bool = False) -> pd.DataFrame:
    '''
    Takes either a single scMKL result or dictionary of results and
    returns a pd.DataFrame with cols ['Alpha', 'Group',
    'Kernel Weight']. If `include_as == True`, another col will be
    added to indicate whether or not the run respective to that alpha
    was chosen as optimal via CV. When `rfiles` is provided, a `'Key'`
    col is added with the key of the result each row came from.

    Parameters
    ----------
    **results** : *None* | *dict*
        > A dictionary with the results of a single run from
        `scmkl.run()`. Must be `None` if `rfiles is not None`.

    **rfiles** : *None* | *dict*
        > A dictionary of results dictionaries containing multiple
        results from `scmkl.run()`. Must be passed by keyword.

    **include_as** : *bool*
        > When `True`, will add a bool col to output pd.DataFrame
        where rows with alphas corresponding to alpha_star will be
        `True`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A pd.DataFrame containing all of the groups from each alpha
        and their corresponding kernel weights.

    Examples
    --------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    '''
    # Checking which data is being worked with
    multi_results = _parse_result_type(results = results, rfiles = rfiles)

    # A single result is parsed for its weights, not its metrics
    if not multi_results:
        return _parse_weights(results = results, include_as = include_as)

    cols = ['Alpha', 'Group', 'Kernel Weight']
    if include_as:
        cols.append('Alpha Star')
    cols.append('Key')

    frames = [_parse_weights(results = result, key = key,
                             include_as = include_as)
              for key, result in rfiles.items()]

    # Empty frames would only degrade col dtypes
    frames = [frame for frame in frames if not frame.empty]

    if not frames:
        return pd.DataFrame(columns = cols)

    # Concatenating once instead of once per result
    return pd.concat(frames)
Takes either a single scMKL result or dictionary of results and returns a pd.DataFrame with cols ['Alpha', 'Group', 'Kernel Weight']. If include_as == True, a fourth col will be added to indicate whether or not the run respective to that alpha was chosen as optimal via CV.
Parameters
results : None | dict
A dictionary with the results of a single run from
scmkl.run
. Must be None
if rfiles is not None
.
rfiles : None | dict
A dictionary of results dictionaries containing multiple results from
scmkl.run
. A col will be added to the output pd.DataFrame with the keys as values corresponding to each row.
include_as : bool
When
True
, will add a bool col to output pd.DataFrame where rows with alphas corresponding to alpha_star will be True
.
Returns
df : pd.DataFrame
A pd.DataFrame containing all of the groups from each alpha and their corresponding kernel weights.
Examples
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles = rfiles)
def get_selection(weights_df, order_groups : bool = False) -> pd.DataFrame:
    '''
    This function takes a pd.DataFrame created by
    `scmkl.get_weights()` and returns a selection table. Selection
    refers to how many times a group had a nonzero group weight. The
    rows are flagged as selected when their kernel weight is greater
    than zero, and the flags are summed per alpha and group. The
    input dataframe is left unchanged.

    Parameters
    ----------
    **weights_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_weights()` with cols
        `['Alpha', 'Group', 'Kernel Weight']`.

    **order_groups** : *bool*
        > If `True`, the `'Group'` col of the output dataframe will be
        made into a `pd.Categorical` col ordered by number of times
        each group was selected in descending order. Default is
        `False`.

    Returns
    -------
    **df** : *pd.DataFrame*
        > A dataframe with cols `['Alpha', 'Group', 'Selection']`.

    Example
    -------
    >>> # For a single file
    >>> results = scmkl.run(adata)
    >>> weights = scmkl.get_weights(results = results)
    >>> selection = scmkl.get_selection(weights)
    >>>
    >>> # For multiple runs saved in a dict
    >>> output_dir = 'scMKL_outputs/'
    >>> rfiles = scmkl.read_files(output_dir)
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    '''
    # Flagging groups with nonzero weight on a new frame so the
    # caller's dataframe does not gain a 'Selection' col
    selected = weights_df['Kernel Weight'] > 0
    flagged = weights_df[['Alpha', 'Group']].assign(Selection = selected)

    # Summing selection across replications to get selection
    df = flagged.groupby(['Alpha', 'Group'])['Selection'].sum()
    df = df.reset_index()

    # Getting group order
    if order_groups:
        order = df.groupby('Group')['Selection'].sum()
        order = order.reset_index().sort_values(by = 'Selection',
                                                ascending = False)
        order = order['Group']
        df['Group'] = pd.Categorical(df['Group'], categories = order)

    return df
This function takes a pd.DataFrame created by
scmkl.get_weights()
and returns a selection table. Selection
refers to how many times a group had a nonzero group weight. To
calculate this, a col is added indicating whether the group was
selected. Then, the dataframe is grouped by alpha and group.
Selection can then be summed returning a dataframe with cols
['Alpha', 'Group', 'Selection']
.
Parameters
weights_df : pd.DataFrame
A dataframe output by
scmkl.get_weights()
with cols ['Alpha', 'Group', 'Kernel Weight']
.
order_groups : bool
If
True
, the 'Group'
col of the output dataframe will be made into a pd.Categorical
col ordered by number of times each group was selected in descending order.
Returns
df : pd.DataFrame
A dataframe with cols
['Alpha', 'Group', 'Selection']
.
Example
>>> # For a single file
>>> results = scmkl.run(adata)
>>> weights = scmkl.get_weights(results = results)
>>> selection = scmkl.get_selection(weights)
>>>
>>> # For multiple runs saved in a dict
>>> output_dir = 'scMKL_outputs/'
>>> rfiles = scmkl.read_files(output_dir)
>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights)
def mean_groups_per_alpha(selection_df) -> dict:
    '''
    This function takes a pd.DataFrame from `scmkl.get_selection()`
    and returns a dictionary with the alphas of the input dataframe
    as keys and the mean of the `'Selection'` col over the rows of
    that alpha as values.

    Parameters
    ----------
    **selection_df** : *pd.DataFrame*
        > A dataframe output by `scmkl.get_selection()` with cols
        `['Alpha', 'Group', 'Selection']`.

    Returns
    -------
    **mean_groups** : *dict*
        > A dictionary with alphas as keys and the mean selection for
        that alpha as values.

    Examples
    --------
    >>> weights = scmkl.get_weights(rfiles = rfiles)
    >>> selection = scmkl.get_selection(weights)
    >>> mean_groups = scmkl.mean_groups_per_alpha(selection)
    >>> mean_groups = {alpha : np.round(num_selected, 1)
    ...                for alpha, num_selected in mean_groups.items()}
    >>>
    >>> print(mean_groups)
    {0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}
    '''
    alpha_col = selection_df['Alpha']

    # One entry per distinct alpha, in sorted order
    return {alpha : np.mean(selection_df.loc[alpha_col == alpha, 'Selection'])
            for alpha in np.unique(alpha_col)}
This function takes a pd.DataFrame from scmkl.get_selection()
generated from multiple scMKL results and returns a dictionary
with keys being alphas from the input dataframe and values being
the mean number of selected groups for a given alpha across
results.
Parameters
selection_df : pd.DataFrame
A dataframe output by
scmkl.get_selection()
with cols ['Alpha', 'Group', 'Selection'].
Returns
mean_groups : dict
A dictionary with alphas as keys and the mean number of selected groups for that alpha as values.
Examples
>>> weights = scmkl.get_weights(rfiles)
>>> selection = scmkl.get_selection(weights)
>>> mean_groups = scmkl.mean_groups_per_alpha(selection)
>>> mean_groups = {alpha : np.round(num_selected, 1)
... for alpha, num_selected in mean_groups.items()}
>>>
>>> print(mean_groups)
{0.05 : 50.0, 0.2 : 24.7, 1.1 : 5.3}