from runners.experiment_utils import load_experiment, save_experiment
from src import logging
from runners.utils import load_yaml, flatten
from . import cmd, document_parser
import glob
import pandas as pd
import os
import copy
import numpy as np
from argparse import ArgumentParser
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from gspread import WorksheetNotFound
def init_gsheet(credentials_path):
"""
Initializes the Google Sheets client given a path to credentials.
Args:
credentials_path (str): path to your Google credentials that are used to
authorize the Google Sheets access.
Returns:
:class:`gspread.Client`: Google Sheets Client initialized with credentials.
"""
scope = ['https://spreadsheets.google.com/feeds',
'https://www.googleapis.com/auth/drive']
credentials = ServiceAccountCredentials.from_json_keyfile_name(
credentials_path, scope
)
gc = gspread.authorize(credentials)
return gc
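
# A minimal usage sketch for the client returned above (hypothetical file and
# sheet names; the service account in the keyfile must have been shared on the
# target spreadsheet):
#
#   gc = init_gsheet('service_account.json')
#   sheet = gc.open('experiment-results')
#   worksheet = sheet.worksheet('Sheet1')
#   print(worksheet.get_all_values())
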
def upload_to_gsheet(results, config, exp=None, upload_source_metrics=False):
"""
Uploads the analysis to the Google Sheet, if possible.
Args:
results (:class:`pandas.DataFrame`): DataFrame containing all the results - output
by :py:func:`scripts.analyze.analyze`.
config (dict): Dictionary containing the entire experiment configuration.
exp (:class:`comet_ml.Experiment`): Experiment given by comet.ml (optional).
        upload_source_metrics (bool): Uploads metrics for each source if True. Defaults to False.
            Can run up against the Google Sheets API rate limit. If there are too many
            sources, the script will hit the limit and fail.
"""
credentials_path = os.getenv('PATH_TO_GOOGLE_CREDENTIALS', None)
if not credentials_path:
        logging.info('PATH_TO_GOOGLE_CREDENTIALS not set, not uploading results to Google Sheets.')
return None
gc = init_gsheet(credentials_path)
config = copy.deepcopy(config)
sheet_name = config['info'].pop('spreadsheet_name', None)
worksheet_name = config['info'].pop('worksheet_name', None)
if not sheet_name or not worksheet_name:
        logging.info('Spreadsheet or worksheet name not specified, not uploading results to Google Sheets.')
return None
logging.info(f'Opening {sheet_name} with {worksheet_name}')
sheet = gc.open(sheet_name)
try:
summary_worksheet = sheet.worksheet(worksheet_name)
except WorksheetNotFound:
logging.info(f'Worksheet not found, creating new sheet w/ name {worksheet_name}')
template_worksheet = sheet.worksheet('Template')
summary_worksheet = template_worksheet.duplicate(new_sheet_name=worksheet_name)
datasets = np.unique(results['dataset'])
metrics = ['SDR', 'SIR', 'SAR']
notes = config['info'].pop('notes', 'No notes')
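    # Truncate (not round) to `decs` decimal places, e.g. 3.999 -> 3.99 at decs=2,
    # so values shown in the sheet never read higher than the true means.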
def trunc(values, decs=0):
return np.trunc(values*10**decs)/(10**decs)
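    # Fetch every existing row once up front; the duplicate check below can then
    # run locally without further API calls.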
existing_rows = summary_worksheet.get_all_values()
for dataset in datasets:
logging.info(
f"Uploading results for {dataset} for {config['info']['experiment_key']} "
f"@ {worksheet_name} in {summary_worksheet}"
)
_results = results[results['dataset'] == dataset]
dataset_paths = {
key: config['datasets'][key]['folder']
for key in config['datasets']
}
experiment_key = config['info']['experiment_key']
experiment_url = 'No link'
if hasattr(exp, '_get_experiment_url'):
experiment_url = exp._get_experiment_url()
row_to_insert = [
f'=HYPERLINK("{experiment_url}", "{experiment_key}")',
notes,
            dataset_paths.pop('train', 'No training'),
            dataset_paths.pop('val', 'No validation'),
dataset,
np.unique(_results['file_name']).shape[0],
]
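        # Look for an existing row for this experiment (matching on experiment key,
        # train/val dataset paths, and dataset name). If one is found, its metrics
        # are updated in place below; otherwise a fresh row is inserted at row 3.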
row_exists = False
row_index = 3
for j, row in enumerate(existing_rows):
compared_indices = [2, 3, 4]
row = [row[0]] + [row[i] for i in compared_indices]
inserted_row = (
[config['info']['experiment_key']] +
[str(row_to_insert[i]) for i in compared_indices]
)
            if row == inserted_row:
logging.info("Row already exists")
row_exists = True
row_index = j + 1
break
if not row_exists:
summary_worksheet.insert_row(
row_to_insert, index=3, value_input_option='USER_ENTERED'
)
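        # Write the file count and the truncated mean SDR/SIR/SAR for this dataset,
        # anchored relative to the 'Overall' header cell in the template worksheet.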
        overall_metrics = (
            [np.unique(_results['file_name']).shape[0]] +
            [trunc(x, decs=2) for x in _results[metrics].mean()]
        )
overall_index = summary_worksheet.find('Overall').col - 1
for i, value in enumerate(overall_metrics):
summary_worksheet.update_cell(row_index, overall_index + i, value)
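        # Optionally also write per-source means. Every unique source claims its own
        # column block, so this can run into the Google Sheets API rate limit when
        # there are many sources.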
if upload_source_metrics:
try:
source_names = np.unique(_results['source_name']).tolist()
for source_name in source_names:
                    try:
                        source_name_cell = summary_worksheet.find(source_name)
                    except Exception:
                        # No column for this source exists yet, so claim the next
                        # placeholder column labeled 'Source' and rename it.
                        source_name_cell = summary_worksheet.find('Source')
                        source_name_cell.value = source_name
                        summary_worksheet.update_cells([source_name_cell])
for i, metric in enumerate(metrics):
                        value = trunc(
                            _results.loc[_results['source_name'] == source_name, metric].mean(),
                            decs=2
                        )
summary_worksheet.update_cell(
row_index, source_name_cell.col + i, value
)
            except Exception:
                logging.info(
                    "Failure in uploading. Likely too many unique sources and we hit an API limit."
                )
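
# upload_to_gsheet expects a config shaped roughly like the sketch below; every
# name here is illustrative rather than taken from a real experiment:
#
#   config = {
#       'info': {
#           'spreadsheet_name': 'experiment-results',
#           'worksheet_name': 'baseline-runs',
#           'experiment_key': 'abc123',
#           'notes': 'first pass',
#       },
#       'datasets': {
#           'train': {'folder': '/data/train'},
#           'val': {'folder': '/data/val'},
#           'test': {'folder': '/data/test'},
#       },
#   }
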
def analyze(path_to_yml_file, use_gsheet=False, upload_source_metrics=False):
"""
Analyzes the metrics for all the files that were evaluated in the experiment.
Args:
path_to_yml_file (str): Path to the yml file that defines the experiment. The
corresponding results folder for the experiment is what will be analyzed and put
into a Pandas dataframe.
use_gsheet (bool, optional): Whether or not to upload to the Google Sheet.
Defaults to False.
        upload_source_metrics (bool): Uploads metrics for each source if True. Defaults to False.
            Can run up against the Google Sheets API rate limit. If there are too many
            sources, the script will hit the limit and fail.
Returns:
tuple: 3-element tuple containing
- results (:class:`pandas.DataFrame`): DataFrame containing all of the results
for every file evaluated in the experiment. The DataFrame also has every
key in the experiment configuration in flattened format.
For example, model_config_recurrent_stack_args_embedding_size is a column in the DataFrame.
- config (*dict*): A dictionary containing the configuration of the experiment.
- exp (:class:`comet_ml.Experiment`): An instantiated experiment if comet.ml is needed, otherwise it is None.
"""
config, exp, path_to_yml_file = load_experiment(path_to_yml_file)
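    # Collect every results YAML that the evaluation step wrote for this experiment.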
    paths = glob.glob(
        os.path.join(config['info']['output_folder'], 'results', '**', '*.yml'),
        recursive=True
    )
results = []
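    # Each results file holds a list of evaluations; within each evaluation, every
    # key except 'permutation' names a separated source. Each source becomes one
    # row, tagged with the flattened experiment configuration.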
for _path in paths:
data = load_yaml(_path, [])
for _data in data:
keys = sorted(list(_data.keys()))
keys.remove('permutation')
for key in keys:
flattened = {
'experiment_key': config['info']['experiment_key'],
'notes': config['info']['notes'],
'file_name': _path,
'dataset': config['datasets']['test']['folder'],
'source_name': key.split('/')[-1],
}
flattened.update(flatten(config))
for metric in _data[key]:
flattened[metric] = np.mean(_data[key][metric])
results.append(flattened)
results = pd.DataFrame(results)
    logging.info(results.mean(numeric_only=True))
logging.info(config['info']['experiment_key'])
if use_gsheet:
upload_to_gsheet(results, config, exp, upload_source_metrics)
return results, config, exp
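
# A minimal usage sketch (hypothetical experiment path; exp is None unless
# comet.ml is configured, and the metric columns depend on what was evaluated):
#
#   results, config, exp = analyze('experiments/my_experiment.yml')
#   print(results.groupby('source_name')[['SDR', 'SIR', 'SAR']].mean())
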
@document_parser('analyze', 'scripts.analyze.analyze')
def build_parser():
parser = ArgumentParser()
parser.add_argument(
"-p",
"--path_to_yml_file",
type=str,
required=True,
help="""Path to the configuration for the experiment that is getting analyzed. The
corresponding results folder for the experiment is what will be analyzed and put
into a Pandas dataframe.
"""
)
parser.add_argument(
"--use_gsheet",
action="store_true",
help="""Results can be synced to a Google sheet after analysis if this is true.
Defaults to false.
"""
)
parser.add_argument(
"--upload_source_metrics",
action="store_true",
help="""Uploads metrics for each source if True. Defaults to False.
Can have interactions with the API limit on Google Sheets. If there are two many
sources, then it will hit the limit and the script will break.
"""
)
return parser
if __name__ == '__main__':
cmd(analyze, build_parser)