From 2c47332ee3a5a948bba4262b1d9ad6c67011748b Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 9 May 2022 13:54:30 -0400 Subject: [PATCH 01/25] working on benchmark framework with t-test analysis --- tests/benchmark3.py | 930 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 930 insertions(+) create mode 100644 tests/benchmark3.py diff --git a/tests/benchmark3.py b/tests/benchmark3.py new file mode 100644 index 0000000..adcc44e --- /dev/null +++ b/tests/benchmark3.py @@ -0,0 +1,930 @@ +""" +Roadmap: + + - [ ] +""" + +import random +import sys + +import timerit +import ubelt as ub + +import pandas as pd +import ujson +import json + +import kwarray +import warnings +import math +import scipy +import numpy as np +import itertools as it +import scipy.stats # NOQA + + +def data_lut(input, size): + if input == "Array with UTF-8 strings": + test_object = [] + for x in range(size): + test_object.append( + "نظام الحكم سلطاني وراثي " + "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" + " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " + ) + return test_object + elif input == "Array with doubles": + test_object = [] + for x in range(256): + test_object.append(sys.maxsize * random.random()) + else: + raise KeyError(input) + + +def get_instance_info(): + """ + Get information about the machine and version of the library we are running + the benchmarks on. + + Requirements: + cpuinfo + """ + import cpuinfo + import datetime + start_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() + cpu_brand = cpuinfo.get_cpu_info()['brand_raw'] + instance_info = { + 'cpu_brand': cpu_brand, + 'start_time': start_time, + } + return instance_info + + +def benchmark_json_dumps(): + + JSON_IMPLS = { + "ujson": ujson, # Our json + "json": json, # Python's json + } + + if True: + import nujson + + JSON_IMPLS["nujson"] = nujson + import orjson + + JSON_IMPLS["nujson"] = orjson + import simplejson + + JSON_IMPLS["simplejson"] = simplejson + + version_infos = {k: v.__version__ for k, v in JSON_IMPLS.items()} + + def method_lut(impl): + return JSON_IMPLS[impl].dumps + + # Change params here to modify number of trials + ti = timerit.Timerit(1000, bestof=10, verbose=1) + + # if True, record every trail run and show variance in seaborn + # if False, use the standard timerit min/mean measures + RECORD_ALL = 1 + + # These are the parameters that we benchmark over + basis = { + "input": [ + "Array with UTF-8 strings", + "Array with doubles", + ], + "size": [1, 32, 256, 1024, 2048], + "impl": list(JSON_IMPLS.keys()), + } + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "col": ["input"], + "hue": ["impl"], + "size": [], + } + grid_iter = list(ub.named_product(basis)) + + instance_info = get_instance_info() + + # For each variation of your experiment, create a row. + rows = [] + for params in grid_iter: + group_keys = {} + for gname, labels in group_labels.items(): + group_keys[gname + "_key"] = ub.repr2( + ub.dict_isect(params, labels), compact=1, si=1 + ) + key = ub.repr2(params, compact=1, si=1) + # Make any modifications you need to compute input kwargs for each + # method here. + impl = params["impl"] + impl_version = version_infos[impl] + params["impl_version"] = impl_version + method = method_lut(impl) + data = data_lut(params["input"], params["size"]) + # Timerit will run some user-specified number of loops. 
+ # and compute time stats with similar methodology to timeit + for timer in ti.reset(key): + # Put any setup logic you dont want to time here. + # ... + with timer: + # Put the logic you want to time here + method(data) + + if RECORD_ALL: + # Seaborn will show the variance if this is enabled, otherwise + # use the robust timerit mean / min times + # chunk_iter = ub.chunks(ti.times, ti.bestof) + # times = list(map(min, chunk_iter)) # TODO: timerit method for this + times = ti.robust_times() + for time in times: + row = { + "time": time, + "key": key, + "ti_bestof": ti.bestof, + **instance_info, + **group_keys, + **params, + } + rows.append(row) + else: + row = { + "mean": ti.mean(), + "std": ti.std(), + "min": ti.min(), + "key": key, + "ti_num": ti.num, + "ti_bestof": ti.bestof, + **instance_info, + **group_keys, + **params, + } + rows.append(row) + + bench_results_dpath = ub.Path(ujson.__file__).parent / 'benchmark_results' + bench_results_dpath.ensuredir() + timestamp = instance_info['start_time'].replace(':', '') + bench_results_fpath = bench_results_dpath / 'benchmarks_{}.json'.format(timestamp) + + with open(bench_results_fpath, 'w') as file: + json.dump(rows, file) + + benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + + +def benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL): + + USE_OPENSKILL = True + + time_key = "time" if RECORD_ALL else "min" + + # The rows define a long-form pandas data array. + # Data in long-form makes it very easy to use seaborn. + data = pd.DataFrame(rows) + data = data.sort_values(time_key) + + if RECORD_ALL: + # Show the min / mean if we record all + min_times = data.groupby("key").min().rename({"time": "min"}, axis=1) + mean_times = ( + data.groupby("key")[["time"]].mean().rename({"time": "mean"}, axis=1) + ) + stats_data = pd.concat([min_times, mean_times], axis=1) + stats_data = stats_data.sort_values("min") + else: + stats_data = data + + if USE_OPENSKILL: + # Track the "skill" of each method + # The idea is that each setting of parameters is a game, and each + # "impl" is a player. We rank the players by which is fastest, and + # update their ranking according to the Weng-Lin Bayes ranking model. + # This does not take the fact that some "games" (i.e. parameter + # settings) are more important than others, but it should be fairly + # robust on average. + skillboard = SkillTracker(basis["impl"]) + + other_keys = sorted( + set(stats_data.columns) + - {"key", "impl", "min", "mean", "hue_key", "size_key", "style_key"} + ) + for params, variants in stats_data.groupby(other_keys): + variants = variants.sort_values("mean") + ranking = variants["impl"].reset_index(drop=True) + + mean_speedup = variants["mean"].max() / variants["mean"] + stats_data.loc[mean_speedup.index, "mean_speedup"] = mean_speedup + min_speedup = variants["min"].max() / variants["min"] + stats_data.loc[min_speedup.index, "min_speedup"] = min_speedup + + if USE_OPENSKILL: + skillboard.observe(ranking) + + print("Statistics:") + print(stats_data) + + if USE_OPENSKILL: + win_probs = skillboard.predict_win() + win_probs = ub.sorted_vals(win_probs, reverse=True) + print( + "Aggregated Rankings = {}".format( + ub.repr2(win_probs, nl=1, precision=4, align=":") + ) + ) + + plot = True + if plot: + # import seaborn as sns + # kwplot autosns works well for IPython and script execution. + # not sure about notebooks. 
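+        # Descriptive note (added): the FacetGrid below draws one subplot
+        # column per "col" group (the input type) and one line per "hue"
+        # group (the JSON implementation), with the benchmarked "size" on
+        # the x-axis and the measured time on the y-axis.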
+ import seaborn as sns + + sns.set() + from matplotlib import pyplot as plt + + plotkw = {} + for gname, labels in group_labels.items(): + if labels: + plotkw[gname] = gname + "_key" + + # Your variables may change + # ax = plt.figure().gca() + col = plotkw.pop("col") + facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) + facet.map_dataframe(sns.lineplot, x=xlabel, y=time_key, marker="o", **plotkw) + facet.add_legend() + # sns.lineplot(data=data, ) + # ax.set_title('JSON Benchmarks') + # ax.set_xlabel('Size') + # ax.set_ylabel('Time') + # ax.set_xscale('log') + # ax.set_yscale('log') + + try: + __IPYTHON__ + except NameError: + plt.show() + + +class Result(ub.NiceRepr): + """ + Storage of names, parameters, and quality metrics for a single experiment. + + Attributes: + name (str | None): + Name of the experiment. Optional. This is unused in the analysis. + (i.e. names will never be used computationally. Use them for keys) + + params (Dict[str, object]): configuration of the experiment. + This is a dictionary mapping a parameter name to its value. + + metrics (Dict[str, float]): quantitative results of the experiment + This is a dictionary for each quality metric computed on this + result. + + meta (Dict | None): any other metadata about this result. + This is unused in the analysis. + + Example: + >>> self = Result.demo(rng=32) + >>> print('self = {}'.format(self)) + self = + """ + def __init__(self, name, params, metrics, meta=None): + self.name = name + self.params = params + self.metrics = metrics + self.meta = meta + + def to_dict(self): + row = ub.dict_union({'name': self.name}, self.metrics, self.params) + return row + + def __nice__(self): + row = self.to_dict() + text = ub.repr2(row, compact=True, precision=2, sort=0) + return text + + @classmethod + def demo(cls, rng=None): + import numpy as np + import string + rng = kwarray.ensure_rng(rng) + demo_param_space = { + 'param1': list(range(3)), + 'param2': np.linspace(0, 10, 10), + 'param3': list(string.ascii_lowercase[0:3]), + } + params = {k: rng.choice(b) for k, b in demo_param_space.items()} + metrics = { + 'f1': rng.rand(), + 'acc': rng.rand(), + } + name = ub.hash_data(params)[0:8] + self = cls(name, params, metrics) + return self + + +class ResultAnalysis(ub.NiceRepr): + """ + Groups and runs stats on results + + Runs statistical tests on sets of configuration-metrics pairs + + Attributes: + results (List[Result]): list of results + + ignore_metrics (Set[str]): metrics to ignore + + ignore_params (Set[str]): parameters to ignore + + metric_objectives (Dict[str, str]): + indicate if each metrix should be maximized "max" or minimized + "min" + + metrics (List[str]): + only consider these metrics + + abalation_orders (Set[int]): + The number of parameters to be held constant in each statistical + grouping. Defaults to 1, so it groups together results where 1 + variable is held constant. Including 2 will include pairwise + settings of parameters to be held constant. Using -1 or -2 means + all but 1 or 2 parameters will be held constant, repsectively. 
+ + default_objective (str): + assume max or min for unknown metrics + + Example: + >>> self = ResultAnalysis.demo() + >>> self.analysis() + + Example: + >>> # Given a list of experiments, configs, and results + >>> # Create a ResultAnalysis object + >>> results = ResultAnalysis([ + >>> Result('expt0', {'param1': 2, 'param3': 'b'}, {'f1': 0.75}), + >>> Result('expt1', {'param1': 0, 'param3': 'c'}, {'f1': 0.92}), + >>> Result('expt2', {'param1': 1, 'param3': 'b'}, {'f1': 0.77}), + >>> Result('expt3', {'param1': 1, 'param3': 'a'}, {'f1': 0.67}), + >>> Result('expt4', {'param1': 0, 'param3': 'c'}, {'f1': 0.98}), + >>> Result('expt5', {'param1': 2, 'param3': 'a'}, {'f1': 0.86}), + >>> Result('expt6', {'param1': 1, 'param3': 'c'}, {'f1': 0.77}), + >>> Result('expt7', {'param1': 1, 'param3': 'c'}, {'f1': 0.41}), + >>> Result('expt8', {'param1': 1, 'param3': 'a'}, {'f1': 0.64}), + >>> Result('expt9', {'param1': 0, 'param3': 'b'}, {'f1': 0.95}), + >>> ]) + >>> # Calling the analysis method prints something like the following + >>> results.analysis() + + PARAMETER 'param1' - f1 + ======================= + f1 mean std max min num best + param1 + 0 0.950 0.030000 0.98 0.92 3.0 0.98 + 2 0.805 0.077782 0.86 0.75 2.0 0.86 + 1 0.652 0.147377 0.77 0.41 5.0 0.77 + + ANOVA hypothesis (roughly): the param 'param1' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.0397 + Mean-ANOVA: p=0.0277 + + Pairwise T-Tests + Is param1=0 about as good as param1=2? + ttest_ind: p=0.2058 + Is param1=1 about as good as param1=2? + ttest_ind: p=0.1508 + + + PARAMETER 'param3' - f1 + ======================= + f1 mean std max min num best + param3 + c 0.770000 0.255734 0.98 0.41 4.0 0.98 + b 0.823333 0.110151 0.95 0.75 3.0 0.95 + a 0.723333 0.119304 0.86 0.64 3.0 0.86 + + ANOVA hypothesis (roughly): the param 'param3' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.5890 + Mean-ANOVA: p=0.8145 + + Pairwise T-Tests + Is param3=b about as good as param3=c? + ttest_ind: p=0.7266 + Is param3=a about as good as param3=b? + ttest_ind: p=0.3466 + ttest_rel: p=0.3466 + Is param3=a about as good as param3=c? 
+ ttest_ind: p=0.7626 + """ + + def __init__(self, results, metrics=None, ignore_params=None, + ignore_metrics=None, metric_objectives=None, + abalation_orders={1}, default_objective='max'): + self.results = results + if ignore_metrics is None: + ignore_metrics = set() + if ignore_params is None: + ignore_params = set() + self.ignore_params = ignore_params + self.ignore_metrics = ignore_metrics + + self.abalation_orders = abalation_orders + self.default_objective = default_objective + + # encode if we want to maximize or minimize a metric + default_metric_to_objective = { + 'ap': 'max', + 'acc': 'max', + 'f1': 'max', + # + 'loss': 'min', + 'brier': 'min', + } + if metric_objectives is None: + metric_objectives = {} + + self.metric_objectives = default_metric_to_objective.copy() + self.metric_objectives.update(metric_objectives) + + self.metrics = metrics + self.statistics = None + + self._description = {} + self._description['built'] = False + self._description['num_results'] = len(self.results) + + def __nice__(self): + # if len(self._description) == 0: + # return 'unbuilt' + # else: + return ub.repr2(self._description, si=1, sv=1) + + @classmethod + def demo(cls, num=10, rng=None): + rng = kwarray.ensure_rng(rng) + results = [Result.demo(rng=rng) for _ in range(num)] + self = cls(results, metrics={'f1', 'acc'}) + return self + + def run(self): + self.build() + self.report() + + def analysis(self): + # alias for run + return self.run() + self.build() + self.report() + + @ub.memoize_property + def table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + def metric_table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + @ub.memoize_property + def varied(self): + config_rows = [r.params for r in self.results] + sentinel = object() + # pd.DataFrame(config_rows).channels + varied = dict(ub.varied_values(config_rows, default=sentinel, min_variations=1)) + # remove nans + varied = { + k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} + for k, vs in varied.items()} + varied = {k: vs for k, vs in varied.items() if len(vs)} + return varied + + def abalation_groups(self, param): + """ + Example: + >>> self = ResultAnalysis.demo() + >>> param = 'param2' + >>> self.abalation_groups(param) + """ + table = self.table + config_rows = [r.params for r in self.results] + config_keys = list(map(set, config_rows)) + if self.ignore_params: + config_keys = [c - self.ignore_params for c in config_keys] + isect_params = set.intersection(*config_keys) + other_params = sorted(isect_params - {param}) + groups = [] + for key, group in table.groupby(other_params, dropna=False): + if len(group) > 1: + groups.append(group) + return groups + + def abalate_one(self, param): + """ + Example: + >>> self = ResultAnalysis.demo() + >>> param = 'param2' + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self.abalate_one(param) + """ + import itertools as it + if self.table is None: + self.table = self.build_table() + param_unique_vals = self.table[param].unique().tolist() + score_improvements = ub.ddict(list) + scored_obs = [] + skillboard = SkillTracker(param_unique_vals) + groups = self.abalation_groups(param) + + for group in groups: + for metric_key in self.metrics: + ascending = self._objective_is_ascending(metric_key) + + group = group.sort_values(metric_key, ascending=ascending) + subgroups = group.groupby(param) + if ascending: + best_idx = subgroups[metric_key].idxmax() + else: + best_idx = 
subgroups[metric_key].idxmin() + best_group = group.loc[best_idx] + best_group = best_group.sort_values(metric_key, ascending=ascending) + + for x1, x2 in it.product(best_group.index, best_group.index): + if x1 != x2: + r1 = best_group.loc[x1] + r2 = best_group.loc[x2] + k1 = r1[param] + k2 = r2[param] + diff = r1[metric_key] - r2[metric_key] + score_improvements[(k1, k2)].append(diff) + + # metric_vals = best_group[metric_key].values + # diffs = metric_vals[None, :] - metric_vals[:, None] + best_group.set_index(param) + # best_group[param] + # best_group[metric_key].diff() + scored_ranking = best_group[[param, metric_key]].reset_index(drop=True) + scored_obs.append(scored_ranking) + skillboard.observe(scored_ranking[param]) + + print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) + win_probs = skillboard.predict_win() + print(f'win_probs={win_probs}') + for key, improves in score_improvements.items(): + k1, k2 = key + improves = np.array(improves) + pos_delta = improves[improves > 0] + print(f'\nWhen {param}={k1} is better than {param}={k2}') + print(pd.DataFrame([pd.Series(pos_delta).describe().T])) + return scored_obs + # self.varied[param] + + def _objective_is_ascending(self, metric_key): + """ + Return True if we should minimize the objective (lower is better) + Return False if we should maximize the objective (higher is better) + """ + objective = self.metric_objectives.get(metric_key, None) + if objective is None: + warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + objective = self.default_objective + ascending = (objective == 'min') + return ascending + + def test_group(self, param_group, metric_key): + """ + Get stats for a particular metric / constant group + + Args: + param_group (List[str]): group of parameters to hold constant. + metric_key (str): The metric to test. + + Returns: + dict + # TODO : document these stats clearly and accurately + + Example: + >>> self = ResultAnalysis.demo(num=30) + >>> print(self.table) + >>> param_group = ['param2'] + >>> metric_key = 'f1' + >>> stats_row = self.test_group(param_group, metric_key) + >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) + >>> # --- + >>> self.build() + >>> self.report() + """ + param_group_name = ','.join(param_group) + stats_row = { + 'param_name': param_group_name, + 'metric': metric_key, + } + # param_values = varied[param_name] + # stats_row['param_values'] = param_values + ascending = self._objective_is_ascending(metric_key) + + # Find all items with this particular param value + value_to_metric_group = {} + value_to_metric_stats = {} + value_to_metric = {} + + varied_cols = sorted(self.varied.keys()) + + # Not sure if this is the right name, these are the other param keys + # that we are not directly investigating, but might have an impact. 
+ # We use these to select comparable rows for pairwise t-tests + nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) + + for param_value, group in self.table.groupby(param_group): + metric_group = group[['name', metric_key] + varied_cols] + metric_vals = metric_group[metric_key] + metric_vals = metric_vals.dropna() + if len(metric_vals) > 0: + metric_stats = metric_vals.describe() + value_to_metric_stats[param_value] = metric_stats + value_to_metric_group[param_value] = metric_group + value_to_metric[param_value] = metric_vals.values + + moments = pd.DataFrame(value_to_metric_stats).T + moments = moments.sort_values('mean', ascending=ascending) + moments.index.name = param_group_name + moments.columns.name = metric_key + ranking = moments['mean'].index.to_list() + param_to_rank = ub.invert_dict(dict(enumerate(ranking))) + + # Determine a set of value pairs to do pairwise comparisons on + value_pairs = ub.oset() + value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) + value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) + + # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance + # If the researcher can make the assumptions of an identically + # shaped and scaled distribution for all groups, except for any + # difference in medians, then the null hypothesis is that the + # medians of all groups are equal, and the alternative + # hypothesis is that at least one population median of one + # group is different from the population median of at least one + # other group. + try: + anova_krus_result = scipy.stats.kruskal(*value_to_metric.values()) + except ValueError: + anova_krus_result = scipy.stats.stats.KruskalResult(np.nan, np.nan) + + # https://en.wikipedia.org/wiki/One-way_analysis_of_variance + # The One-Way ANOVA tests the null hypothesis, which states + # that samples in all groups are drawn from populations with + # the same mean values + if len(value_to_metric) > 1: + anova_1way_result = scipy.stats.f_oneway(*value_to_metric.values()) + else: + anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) + + stats_row['anova_rank_H'] = anova_krus_result.statistic + stats_row['anova_rank_p'] = anova_krus_result.pvalue + stats_row['anova_mean_F'] = anova_1way_result.statistic + stats_row['anova_mean_p'] = anova_1way_result.pvalue + stats_row['moments'] = moments + + pairwise_statistics = [] + for pair in value_pairs: + pair_statistics = {} + # try: + # param_val1, param_val2 = sorted(pair) + # except Exception: + # param_val1, param_val2 = (pair) + param_val1, param_val2 = pair + + metric_vals1 = value_to_metric[param_val1] + metric_vals2 = value_to_metric[param_val2] + + rank1 = param_to_rank[param_val1] + rank2 = param_to_rank[param_val2] + pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 + pair_statistics['value1'] = param_val1 + pair_statistics['value2'] = param_val2 + pair_statistics['n1'] = len(metric_vals1) + pair_statistics['n2'] = len(metric_vals2) + ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, equal_var=False) + pair_statistics['ttest_ind'] = ttest_ind_result + + # Do relative checks, need to find comparable subgroups + metric_group1 = value_to_metric_group[param_val1] + metric_group2 = value_to_metric_group[param_val2] + nuisance_vals1 = metric_group1[nuisance_cols] + nuisance_vals2 = metric_group2[nuisance_cols] + nk_to_group1 = dict(list(nuisance_vals1.groupby(nuisance_cols))) + nk_to_group2 = 
dict(list(nuisance_vals2.groupby(nuisance_cols))) + common = set(nk_to_group1) & set(nk_to_group2) + comparable_indexes1 = [] + comparable_indexes2 = [] + if common: + for nk in common: + group1 = nk_to_group1[nk] + group2 = nk_to_group2[nk] + for i, j in it.product(group1.index, group2.index): + comparable_indexes1.append(i) + comparable_indexes2.append(j) + + comparable_groups1 = metric_group1.loc[comparable_indexes1, metric_key] + comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] + + # Does this need to have the values aligned? + ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) + pair_statistics['ttest_rel'] = ttest_rel_result + pairwise_statistics.append(pair_statistics) + + stats_row['pairwise'] = pairwise_statistics + return stats_row + + def build(self): + import itertools as it + if len(self.results) < 2: + raise Exception('need at least 2 results') + + varied = self.varied.copy() + if self.ignore_params: + for k in self.ignore_params: + varied.pop(k, None) + + # Experimental: + # Find Auto-abalation groups + # TODO: when the group size is -1, instead of showing all of the group + # settings, for each group setting do the k=1 analysis within that group + varied_param_names = list(varied.keys()) + num_varied_params = len(varied) + held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} + held_constant_orders = [i for i in held_constant_orders if i > 0] + held_constant_groups = [] + for k in held_constant_orders: + held_constant_groups.extend( + list(map(list, it.combinations(varied_param_names, k)))) + + if self.metrics is None: + avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) + metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) + else: + metrics_of_interest = self.metrics + self.metrics_of_interest = metrics_of_interest + self._description['metrics_of_interest'] = metrics_of_interest + self._description['num_groups'] = len(held_constant_groups) + + # Analyze the impact of each parameter + self.statistics = statistics = [] + for param_group in held_constant_groups: + for metric_key in metrics_of_interest: + stats_row = self.test_group(param_group, metric_key) + statistics.append(stats_row) + + self.stats_table = pd.DataFrame([ + ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) + for d in self.statistics]) + + if len(self.stats_table): + self.stats_table = self.stats_table.sort_values('anova_rank_p') + + self._description['built'] = True + + def report(self): + p_threshold = 0.05 + stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) + stat_groups_items = list(stat_groups.items()) + + # Modify this order to change the grouping pattern + grid = ub.named_product({ + 'stat_group_item': stat_groups_items, + 'metrics': self.metrics_of_interest, + }) + for grid_item in grid: + metric_key = grid_item['metrics'] + stat_groups_item = grid_item['stat_group_item'] + + param_name, stat_group = stat_groups_item + stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] + title = ('PARAMETER {!r} - {}'.format(param_name, metric_key)) + print('\n\n') + print(title) + print('=' * len(title)) + print(stats_row['moments']) + anova_rank_p = stats_row['anova_rank_p'] + anova_mean_p = stats_row['anova_mean_p'] + # Rougly speaking + print('') + print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < 
p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print('') + print('Pairwise T-Tests') + for pairstat in stats_row['pairwise']: + # Is this backwards? + value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') + if 'ttest_ind' in pairstat: + ttest_ind_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + if 'ttest_rel' in pairstat: + ttest_rel_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + + print(self.stats_table) + + def conclusions(self): + conclusions = [] + for stat in self.statistics: + param_name = stat['param_name'] + metric = stat['metric'] + for pairstat in stat['pairwise']: + value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + pvalue = stat = pairstat['ttest_ind'].pvalue + txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') + conclusions.append(txt) + return conclusions + + +class SkillTracker: + """ + Wrapper around openskill + + Args: + player_ids (List[T]): + a list of ids (usually ints) used to represent each player + + Example: + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self = SkillTracker([1, 2, 3, 4, 5]) + >>> self.observe([2, 3]) # Player 2 beat player 3. + >>> self.observe([1, 2, 5, 3]) # Player 3 didnt play this round. + >>> self.observe([2, 3, 4, 5, 1]) # Everyone played, player 2 won. + >>> win_probs = self.predict_win() + >>> print('win_probs = {}'.format(ub.repr2(win_probs, nl=1, precision=2))) + win_probs = { + 1: 0.20, + 2: 0.21, + 3: 0.19, + 4: 0.20, + 5: 0.20, + } + """ + + def __init__(self, player_ids): + import openskill + self.player_ids = player_ids + self.ratings = {m: openskill.Rating() for m in player_ids} + self.observations = [] + + def predict_win(self): + """ + Estimate the probability that a particular player will win given the + current ratings. + + Returns: + Dict[T, float]: mapping from player ids to win probabilites + """ + from openskill import predict_win + teams = [[p] for p in list(self.ratings.keys())] + ratings = [[r] for r in self.ratings.values()] + probs = predict_win(ratings) + win_probs = {team[0]: prob for team, prob in zip(teams, probs)} + return win_probs + + def observe(self, ranking): + """ + After simulating a round, pass the ranked order of who won + (winner is first, looser is last) to this function. And it + updates the rankings. + + Args: + ranking (List[T]): + ranking of all the players that played in this round + winners are at the front (0-th place) of the list. 
+ """ + import openskill + self.observations.append(ranking) + ratings = self.ratings + team_standings = [[r] for r in ub.take(ratings, ranking)] + new_values = openskill.rate(team_standings) # Not inplace + new_ratings = [openskill.Rating(*new[0]) for new in new_values] + ratings.update(ub.dzip(ranking, new_ratings)) + + +if __name__ == "__main__": + """ + CommandLine: + python ~/code/ultrajson/tests/benchmark3.py + """ + benchmark_json_dumps() From da6428296d2ace46f822ed86a703a88226ad1cea Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 23 May 2022 00:52:30 -0400 Subject: [PATCH 02/25] Working on benchmarks with details statistical analysis --- tests/benchmark3.py | 918 ++------------------------- tests/benchmarker/__init__.py | 35 + tests/benchmarker/_test_ttest.py | 28 + tests/benchmarker/aggregate.py | 68 ++ tests/benchmarker/benchmarker.py | 230 +++++++ tests/benchmarker/process_context.py | 103 +++ tests/benchmarker/result_analysis.py | 722 +++++++++++++++++++++ tests/benchmarker/util_json.py | 233 +++++++ tests/benchmarker/visualize.py | 113 ++++ 9 files changed, 1596 insertions(+), 854 deletions(-) create mode 100644 tests/benchmarker/__init__.py create mode 100644 tests/benchmarker/_test_ttest.py create mode 100644 tests/benchmarker/aggregate.py create mode 100644 tests/benchmarker/benchmarker.py create mode 100644 tests/benchmarker/process_context.py create mode 100644 tests/benchmarker/result_analysis.py create mode 100644 tests/benchmarker/util_json.py create mode 100644 tests/benchmarker/visualize.py diff --git a/tests/benchmark3.py b/tests/benchmark3.py index adcc44e..181b2a4 100644 --- a/tests/benchmark3.py +++ b/tests/benchmark3.py @@ -3,25 +3,10 @@ Roadmap: - [ ] """ - import random import sys - -import timerit import ubelt as ub -import pandas as pd -import ujson -import json - -import kwarray -import warnings -import math -import scipy -import numpy as np -import itertools as it -import scipy.stats # NOQA - def data_lut(input, size): if input == "Array with UTF-8 strings": @@ -41,55 +26,54 @@ def data_lut(input, size): raise KeyError(input) -def get_instance_info(): - """ - Get information about the machine and version of the library we are running - the benchmarks on. 
+def available_json_impls(): + JSON_IMPLS = {} - Requirements: - cpuinfo - """ - import cpuinfo - import datetime - start_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() - cpu_brand = cpuinfo.get_cpu_info()['brand_raw'] - instance_info = { - 'cpu_brand': cpu_brand, - 'start_time': start_time, - } - return instance_info + try: + import json + JSON_IMPLS["json"] = json + except ImportError: + pass + + try: + import ujson + JSON_IMPLS["ujson"] = ujson + except ImportError: + pass + + try: + import nujson + JSON_IMPLS["nujson"] = nujson + except ImportError: + pass + + try: + import orjson + JSON_IMPLS["nujson"] = orjson + except ImportError: + pass + + try: + import simplejson + JSON_IMPLS["simplejson"] = simplejson + except ImportError: + pass + + return JSON_IMPLS def benchmark_json_dumps(): + # TODO: remove this hack + sys.path.append(ub.expandpath('~/code/ultrajson/tests')) + from benchmarker import Benchmarker - JSON_IMPLS = { - "ujson": ujson, # Our json - "json": json, # Python's json - } - - if True: - import nujson - - JSON_IMPLS["nujson"] = nujson - import orjson - - JSON_IMPLS["nujson"] = orjson - import simplejson - - JSON_IMPLS["simplejson"] = simplejson + JSON_IMPLS = available_json_impls() version_infos = {k: v.__version__ for k, v in JSON_IMPLS.items()} def method_lut(impl): return JSON_IMPLS[impl].dumps - # Change params here to modify number of trials - ti = timerit.Timerit(1000, bestof=10, verbose=1) - - # if True, record every trail run and show variance in seaborn - # if False, use the standard timerit min/mean measures - RECORD_ALL = 1 - # These are the parameters that we benchmark over basis = { "input": [ @@ -99,26 +83,17 @@ def benchmark_json_dumps(): "size": [1, 32, 256, 1024, 2048], "impl": list(JSON_IMPLS.keys()), } - xlabel = "size" - # Set these to empty lists if they are not used - group_labels = { - "col": ["input"], - "hue": ["impl"], - "size": [], - } - grid_iter = list(ub.named_product(basis)) - instance_info = get_instance_info() + benchmark = Benchmarker( + name='bench_json_dumps', + # Change params here to modify number of trials + num=100, + bestof=10, + basis=basis, + ) # For each variation of your experiment, create a row. - rows = [] - for params in grid_iter: - group_keys = {} - for gname, labels in group_labels.items(): - group_keys[gname + "_key"] = ub.repr2( - ub.dict_isect(params, labels), compact=1, si=1 - ) - key = ub.repr2(params, compact=1, si=1) + for params in benchmark.iter_params(): # Make any modifications you need to compute input kwargs for each # method here. impl = params["impl"] @@ -128,798 +103,33 @@ def benchmark_json_dumps(): data = data_lut(params["input"], params["size"]) # Timerit will run some user-specified number of loops. # and compute time stats with similar methodology to timeit - for timer in ti.reset(key): + for timer in benchmark.measure(): # Put any setup logic you dont want to time here. # ... 
with timer: # Put the logic you want to time here method(data) - if RECORD_ALL: - # Seaborn will show the variance if this is enabled, otherwise - # use the robust timerit mean / min times - # chunk_iter = ub.chunks(ti.times, ti.bestof) - # times = list(map(min, chunk_iter)) # TODO: timerit method for this - times = ti.robust_times() - for time in times: - row = { - "time": time, - "key": key, - "ti_bestof": ti.bestof, - **instance_info, - **group_keys, - **params, - } - rows.append(row) - else: - row = { - "mean": ti.mean(), - "std": ti.std(), - "min": ti.min(), - "key": key, - "ti_num": ti.num, - "ti_bestof": ti.bestof, - **instance_info, - **group_keys, - **params, - } - rows.append(row) - - bench_results_dpath = ub.Path(ujson.__file__).parent / 'benchmark_results' - bench_results_dpath.ensuredir() - timestamp = instance_info['start_time'].replace(':', '') - bench_results_fpath = bench_results_dpath / 'benchmarks_{}.json'.format(timestamp) - - with open(bench_results_fpath, 'w') as file: - json.dump(rows, file) - - benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) - - -def benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL): - - USE_OPENSKILL = True - - time_key = "time" if RECORD_ALL else "min" - - # The rows define a long-form pandas data array. - # Data in long-form makes it very easy to use seaborn. - data = pd.DataFrame(rows) - data = data.sort_values(time_key) - - if RECORD_ALL: - # Show the min / mean if we record all - min_times = data.groupby("key").min().rename({"time": "min"}, axis=1) - mean_times = ( - data.groupby("key")[["time"]].mean().rename({"time": "mean"}, axis=1) - ) - stats_data = pd.concat([min_times, mean_times], axis=1) - stats_data = stats_data.sort_values("min") - else: - stats_data = data - - if USE_OPENSKILL: - # Track the "skill" of each method - # The idea is that each setting of parameters is a game, and each - # "impl" is a player. We rank the players by which is fastest, and - # update their ranking according to the Weng-Lin Bayes ranking model. - # This does not take the fact that some "games" (i.e. parameter - # settings) are more important than others, but it should be fairly - # robust on average. - skillboard = SkillTracker(basis["impl"]) - - other_keys = sorted( - set(stats_data.columns) - - {"key", "impl", "min", "mean", "hue_key", "size_key", "style_key"} - ) - for params, variants in stats_data.groupby(other_keys): - variants = variants.sort_values("mean") - ranking = variants["impl"].reset_index(drop=True) - - mean_speedup = variants["mean"].max() / variants["mean"] - stats_data.loc[mean_speedup.index, "mean_speedup"] = mean_speedup - min_speedup = variants["min"].max() / variants["min"] - stats_data.loc[min_speedup.index, "min_speedup"] = min_speedup - - if USE_OPENSKILL: - skillboard.observe(ranking) - - print("Statistics:") - print(stats_data) - - if USE_OPENSKILL: - win_probs = skillboard.predict_win() - win_probs = ub.sorted_vals(win_probs, reverse=True) - print( - "Aggregated Rankings = {}".format( - ub.repr2(win_probs, nl=1, precision=4, align=":") - ) - ) - - plot = True - if plot: - # import seaborn as sns - # kwplot autosns works well for IPython and script execution. - # not sure about notebooks. 
- import seaborn as sns - - sns.set() - from matplotlib import pyplot as plt - - plotkw = {} - for gname, labels in group_labels.items(): - if labels: - plotkw[gname] = gname + "_key" - - # Your variables may change - # ax = plt.figure().gca() - col = plotkw.pop("col") - facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) - facet.map_dataframe(sns.lineplot, x=xlabel, y=time_key, marker="o", **plotkw) - facet.add_legend() - # sns.lineplot(data=data, ) - # ax.set_title('JSON Benchmarks') - # ax.set_xlabel('Size') - # ax.set_ylabel('Time') - # ax.set_xscale('log') - # ax.set_yscale('log') - - try: - __IPYTHON__ - except NameError: - plt.show() - - -class Result(ub.NiceRepr): - """ - Storage of names, parameters, and quality metrics for a single experiment. - - Attributes: - name (str | None): - Name of the experiment. Optional. This is unused in the analysis. - (i.e. names will never be used computationally. Use them for keys) - - params (Dict[str, object]): configuration of the experiment. - This is a dictionary mapping a parameter name to its value. - - metrics (Dict[str, float]): quantitative results of the experiment - This is a dictionary for each quality metric computed on this - result. - - meta (Dict | None): any other metadata about this result. - This is unused in the analysis. - - Example: - >>> self = Result.demo(rng=32) - >>> print('self = {}'.format(self)) - self = - """ - def __init__(self, name, params, metrics, meta=None): - self.name = name - self.params = params - self.metrics = metrics - self.meta = meta - - def to_dict(self): - row = ub.dict_union({'name': self.name}, self.metrics, self.params) - return row - - def __nice__(self): - row = self.to_dict() - text = ub.repr2(row, compact=True, precision=2, sort=0) - return text - - @classmethod - def demo(cls, rng=None): - import numpy as np - import string - rng = kwarray.ensure_rng(rng) - demo_param_space = { - 'param1': list(range(3)), - 'param2': np.linspace(0, 10, 10), - 'param3': list(string.ascii_lowercase[0:3]), - } - params = {k: rng.choice(b) for k, b in demo_param_space.items()} - metrics = { - 'f1': rng.rand(), - 'acc': rng.rand(), - } - name = ub.hash_data(params)[0:8] - self = cls(name, params, metrics) - return self - - -class ResultAnalysis(ub.NiceRepr): - """ - Groups and runs stats on results - - Runs statistical tests on sets of configuration-metrics pairs - - Attributes: - results (List[Result]): list of results - - ignore_metrics (Set[str]): metrics to ignore - - ignore_params (Set[str]): parameters to ignore - - metric_objectives (Dict[str, str]): - indicate if each metrix should be maximized "max" or minimized - "min" - - metrics (List[str]): - only consider these metrics - - abalation_orders (Set[int]): - The number of parameters to be held constant in each statistical - grouping. Defaults to 1, so it groups together results where 1 - variable is held constant. Including 2 will include pairwise - settings of parameters to be held constant. Using -1 or -2 means - all but 1 or 2 parameters will be held constant, repsectively. 
- - default_objective (str): - assume max or min for unknown metrics - - Example: - >>> self = ResultAnalysis.demo() - >>> self.analysis() - - Example: - >>> # Given a list of experiments, configs, and results - >>> # Create a ResultAnalysis object - >>> results = ResultAnalysis([ - >>> Result('expt0', {'param1': 2, 'param3': 'b'}, {'f1': 0.75}), - >>> Result('expt1', {'param1': 0, 'param3': 'c'}, {'f1': 0.92}), - >>> Result('expt2', {'param1': 1, 'param3': 'b'}, {'f1': 0.77}), - >>> Result('expt3', {'param1': 1, 'param3': 'a'}, {'f1': 0.67}), - >>> Result('expt4', {'param1': 0, 'param3': 'c'}, {'f1': 0.98}), - >>> Result('expt5', {'param1': 2, 'param3': 'a'}, {'f1': 0.86}), - >>> Result('expt6', {'param1': 1, 'param3': 'c'}, {'f1': 0.77}), - >>> Result('expt7', {'param1': 1, 'param3': 'c'}, {'f1': 0.41}), - >>> Result('expt8', {'param1': 1, 'param3': 'a'}, {'f1': 0.64}), - >>> Result('expt9', {'param1': 0, 'param3': 'b'}, {'f1': 0.95}), - >>> ]) - >>> # Calling the analysis method prints something like the following - >>> results.analysis() - - PARAMETER 'param1' - f1 - ======================= - f1 mean std max min num best - param1 - 0 0.950 0.030000 0.98 0.92 3.0 0.98 - 2 0.805 0.077782 0.86 0.75 2.0 0.86 - 1 0.652 0.147377 0.77 0.41 5.0 0.77 - - ANOVA hypothesis (roughly): the param 'param1' has no effect on the metric - Reject this hypothesis if the p value is less than a threshold - Rank-ANOVA: p=0.0397 - Mean-ANOVA: p=0.0277 - - Pairwise T-Tests - Is param1=0 about as good as param1=2? - ttest_ind: p=0.2058 - Is param1=1 about as good as param1=2? - ttest_ind: p=0.1508 - - - PARAMETER 'param3' - f1 - ======================= - f1 mean std max min num best - param3 - c 0.770000 0.255734 0.98 0.41 4.0 0.98 - b 0.823333 0.110151 0.95 0.75 3.0 0.95 - a 0.723333 0.119304 0.86 0.64 3.0 0.86 - - ANOVA hypothesis (roughly): the param 'param3' has no effect on the metric - Reject this hypothesis if the p value is less than a threshold - Rank-ANOVA: p=0.5890 - Mean-ANOVA: p=0.8145 - - Pairwise T-Tests - Is param3=b about as good as param3=c? - ttest_ind: p=0.7266 - Is param3=a about as good as param3=b? - ttest_ind: p=0.3466 - ttest_rel: p=0.3466 - Is param3=a about as good as param3=c? 
- ttest_ind: p=0.7626 - """ - - def __init__(self, results, metrics=None, ignore_params=None, - ignore_metrics=None, metric_objectives=None, - abalation_orders={1}, default_objective='max'): - self.results = results - if ignore_metrics is None: - ignore_metrics = set() - if ignore_params is None: - ignore_params = set() - self.ignore_params = ignore_params - self.ignore_metrics = ignore_metrics - - self.abalation_orders = abalation_orders - self.default_objective = default_objective - - # encode if we want to maximize or minimize a metric - default_metric_to_objective = { - 'ap': 'max', - 'acc': 'max', - 'f1': 'max', - # - 'loss': 'min', - 'brier': 'min', - } - if metric_objectives is None: - metric_objectives = {} - - self.metric_objectives = default_metric_to_objective.copy() - self.metric_objectives.update(metric_objectives) - - self.metrics = metrics - self.statistics = None - - self._description = {} - self._description['built'] = False - self._description['num_results'] = len(self.results) - - def __nice__(self): - # if len(self._description) == 0: - # return 'unbuilt' - # else: - return ub.repr2(self._description, si=1, sv=1) - - @classmethod - def demo(cls, num=10, rng=None): - rng = kwarray.ensure_rng(rng) - results = [Result.demo(rng=rng) for _ in range(num)] - self = cls(results, metrics={'f1', 'acc'}) - return self - - def run(self): - self.build() - self.report() - - def analysis(self): - # alias for run - return self.run() - self.build() - self.report() - - @ub.memoize_property - def table(self): - rows = [r.to_dict() for r in self.results] - table = pd.DataFrame(rows) - return table - - def metric_table(self): - rows = [r.to_dict() for r in self.results] - table = pd.DataFrame(rows) - return table - - @ub.memoize_property - def varied(self): - config_rows = [r.params for r in self.results] - sentinel = object() - # pd.DataFrame(config_rows).channels - varied = dict(ub.varied_values(config_rows, default=sentinel, min_variations=1)) - # remove nans - varied = { - k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} - for k, vs in varied.items()} - varied = {k: vs for k, vs in varied.items() if len(vs)} - return varied - - def abalation_groups(self, param): - """ - Example: - >>> self = ResultAnalysis.demo() - >>> param = 'param2' - >>> self.abalation_groups(param) - """ - table = self.table - config_rows = [r.params for r in self.results] - config_keys = list(map(set, config_rows)) - if self.ignore_params: - config_keys = [c - self.ignore_params for c in config_keys] - isect_params = set.intersection(*config_keys) - other_params = sorted(isect_params - {param}) - groups = [] - for key, group in table.groupby(other_params, dropna=False): - if len(group) > 1: - groups.append(group) - return groups - - def abalate_one(self, param): - """ - Example: - >>> self = ResultAnalysis.demo() - >>> param = 'param2' - >>> # xdoctest: +REQUIRES(module:openskill) - >>> self.abalate_one(param) - """ - import itertools as it - if self.table is None: - self.table = self.build_table() - param_unique_vals = self.table[param].unique().tolist() - score_improvements = ub.ddict(list) - scored_obs = [] - skillboard = SkillTracker(param_unique_vals) - groups = self.abalation_groups(param) - - for group in groups: - for metric_key in self.metrics: - ascending = self._objective_is_ascending(metric_key) - - group = group.sort_values(metric_key, ascending=ascending) - subgroups = group.groupby(param) - if ascending: - best_idx = subgroups[metric_key].idxmax() - else: - best_idx = 
subgroups[metric_key].idxmin() - best_group = group.loc[best_idx] - best_group = best_group.sort_values(metric_key, ascending=ascending) - - for x1, x2 in it.product(best_group.index, best_group.index): - if x1 != x2: - r1 = best_group.loc[x1] - r2 = best_group.loc[x2] - k1 = r1[param] - k2 = r2[param] - diff = r1[metric_key] - r2[metric_key] - score_improvements[(k1, k2)].append(diff) - - # metric_vals = best_group[metric_key].values - # diffs = metric_vals[None, :] - metric_vals[:, None] - best_group.set_index(param) - # best_group[param] - # best_group[metric_key].diff() - scored_ranking = best_group[[param, metric_key]].reset_index(drop=True) - scored_obs.append(scored_ranking) - skillboard.observe(scored_ranking[param]) - - print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) - win_probs = skillboard.predict_win() - print(f'win_probs={win_probs}') - for key, improves in score_improvements.items(): - k1, k2 = key - improves = np.array(improves) - pos_delta = improves[improves > 0] - print(f'\nWhen {param}={k1} is better than {param}={k2}') - print(pd.DataFrame([pd.Series(pos_delta).describe().T])) - return scored_obs - # self.varied[param] - - def _objective_is_ascending(self, metric_key): - """ - Return True if we should minimize the objective (lower is better) - Return False if we should maximize the objective (higher is better) - """ - objective = self.metric_objectives.get(metric_key, None) - if objective is None: - warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') - objective = self.default_objective - ascending = (objective == 'min') - return ascending - - def test_group(self, param_group, metric_key): - """ - Get stats for a particular metric / constant group - - Args: - param_group (List[str]): group of parameters to hold constant. - metric_key (str): The metric to test. - - Returns: - dict - # TODO : document these stats clearly and accurately - - Example: - >>> self = ResultAnalysis.demo(num=30) - >>> print(self.table) - >>> param_group = ['param2'] - >>> metric_key = 'f1' - >>> stats_row = self.test_group(param_group, metric_key) - >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) - >>> # --- - >>> self.build() - >>> self.report() - """ - param_group_name = ','.join(param_group) - stats_row = { - 'param_name': param_group_name, - 'metric': metric_key, - } - # param_values = varied[param_name] - # stats_row['param_values'] = param_values - ascending = self._objective_is_ascending(metric_key) - - # Find all items with this particular param value - value_to_metric_group = {} - value_to_metric_stats = {} - value_to_metric = {} - - varied_cols = sorted(self.varied.keys()) - - # Not sure if this is the right name, these are the other param keys - # that we are not directly investigating, but might have an impact. 
- # We use these to select comparable rows for pairwise t-tests - nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) - - for param_value, group in self.table.groupby(param_group): - metric_group = group[['name', metric_key] + varied_cols] - metric_vals = metric_group[metric_key] - metric_vals = metric_vals.dropna() - if len(metric_vals) > 0: - metric_stats = metric_vals.describe() - value_to_metric_stats[param_value] = metric_stats - value_to_metric_group[param_value] = metric_group - value_to_metric[param_value] = metric_vals.values - - moments = pd.DataFrame(value_to_metric_stats).T - moments = moments.sort_values('mean', ascending=ascending) - moments.index.name = param_group_name - moments.columns.name = metric_key - ranking = moments['mean'].index.to_list() - param_to_rank = ub.invert_dict(dict(enumerate(ranking))) - - # Determine a set of value pairs to do pairwise comparisons on - value_pairs = ub.oset() - value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) - value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) - - # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance - # If the researcher can make the assumptions of an identically - # shaped and scaled distribution for all groups, except for any - # difference in medians, then the null hypothesis is that the - # medians of all groups are equal, and the alternative - # hypothesis is that at least one population median of one - # group is different from the population median of at least one - # other group. - try: - anova_krus_result = scipy.stats.kruskal(*value_to_metric.values()) - except ValueError: - anova_krus_result = scipy.stats.stats.KruskalResult(np.nan, np.nan) - - # https://en.wikipedia.org/wiki/One-way_analysis_of_variance - # The One-Way ANOVA tests the null hypothesis, which states - # that samples in all groups are drawn from populations with - # the same mean values - if len(value_to_metric) > 1: - anova_1way_result = scipy.stats.f_oneway(*value_to_metric.values()) - else: - anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) - - stats_row['anova_rank_H'] = anova_krus_result.statistic - stats_row['anova_rank_p'] = anova_krus_result.pvalue - stats_row['anova_mean_F'] = anova_1way_result.statistic - stats_row['anova_mean_p'] = anova_1way_result.pvalue - stats_row['moments'] = moments - - pairwise_statistics = [] - for pair in value_pairs: - pair_statistics = {} - # try: - # param_val1, param_val2 = sorted(pair) - # except Exception: - # param_val1, param_val2 = (pair) - param_val1, param_val2 = pair - - metric_vals1 = value_to_metric[param_val1] - metric_vals2 = value_to_metric[param_val2] - - rank1 = param_to_rank[param_val1] - rank2 = param_to_rank[param_val2] - pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 - pair_statistics['value1'] = param_val1 - pair_statistics['value2'] = param_val2 - pair_statistics['n1'] = len(metric_vals1) - pair_statistics['n2'] = len(metric_vals2) - ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, equal_var=False) - pair_statistics['ttest_ind'] = ttest_ind_result - - # Do relative checks, need to find comparable subgroups - metric_group1 = value_to_metric_group[param_val1] - metric_group2 = value_to_metric_group[param_val2] - nuisance_vals1 = metric_group1[nuisance_cols] - nuisance_vals2 = metric_group2[nuisance_cols] - nk_to_group1 = dict(list(nuisance_vals1.groupby(nuisance_cols))) - nk_to_group2 = 
dict(list(nuisance_vals2.groupby(nuisance_cols))) - common = set(nk_to_group1) & set(nk_to_group2) - comparable_indexes1 = [] - comparable_indexes2 = [] - if common: - for nk in common: - group1 = nk_to_group1[nk] - group2 = nk_to_group2[nk] - for i, j in it.product(group1.index, group2.index): - comparable_indexes1.append(i) - comparable_indexes2.append(j) - - comparable_groups1 = metric_group1.loc[comparable_indexes1, metric_key] - comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] - - # Does this need to have the values aligned? - ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) - pair_statistics['ttest_rel'] = ttest_rel_result - pairwise_statistics.append(pair_statistics) - - stats_row['pairwise'] = pairwise_statistics - return stats_row - - def build(self): - import itertools as it - if len(self.results) < 2: - raise Exception('need at least 2 results') - - varied = self.varied.copy() - if self.ignore_params: - for k in self.ignore_params: - varied.pop(k, None) - - # Experimental: - # Find Auto-abalation groups - # TODO: when the group size is -1, instead of showing all of the group - # settings, for each group setting do the k=1 analysis within that group - varied_param_names = list(varied.keys()) - num_varied_params = len(varied) - held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} - held_constant_orders = [i for i in held_constant_orders if i > 0] - held_constant_groups = [] - for k in held_constant_orders: - held_constant_groups.extend( - list(map(list, it.combinations(varied_param_names, k)))) - - if self.metrics is None: - avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) - metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) - else: - metrics_of_interest = self.metrics - self.metrics_of_interest = metrics_of_interest - self._description['metrics_of_interest'] = metrics_of_interest - self._description['num_groups'] = len(held_constant_groups) - - # Analyze the impact of each parameter - self.statistics = statistics = [] - for param_group in held_constant_groups: - for metric_key in metrics_of_interest: - stats_row = self.test_group(param_group, metric_key) - statistics.append(stats_row) - - self.stats_table = pd.DataFrame([ - ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) - for d in self.statistics]) - - if len(self.stats_table): - self.stats_table = self.stats_table.sort_values('anova_rank_p') - - self._description['built'] = True - - def report(self): - p_threshold = 0.05 - stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) - stat_groups_items = list(stat_groups.items()) - - # Modify this order to change the grouping pattern - grid = ub.named_product({ - 'stat_group_item': stat_groups_items, - 'metrics': self.metrics_of_interest, + dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() + benchmark.dump_in_dpath(dpath) + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "mean" + + from benchmarker import result_analysis + results = benchmark.result.to_result_list() + analysis = result_analysis.ResultAnalysis( + results, + metrics=[metric_key], + params=['impl'], + metric_objectives={ + 'min': 'min', + 'mean': 'min', + 'time': 'min', }) - for grid_item in grid: - metric_key = grid_item['metrics'] - stat_groups_item = grid_item['stat_group_item'] + analysis.analysis() - param_name, stat_group = stat_groups_item - stats_row = ub.group_items(stat_group, key=lambda x: 
x['metric'])[metric_key][0] - title = ('PARAMETER {!r} - {}'.format(param_name, metric_key)) - print('\n\n') - print(title) - print('=' * len(title)) - print(stats_row['moments']) - anova_rank_p = stats_row['anova_rank_p'] - anova_mean_p = stats_row['anova_mean_p'] - # Rougly speaking - print('') - print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) - print('') - print('Pairwise T-Tests') - for pairstat in stats_row['pairwise']: - # Is this backwards? - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] - if value2 == winner: - value1, value2 = value2, value1 - print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') - if 'ttest_ind' in pairstat: - ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) - if 'ttest_rel' in pairstat: - ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) - - print(self.stats_table) - - def conclusions(self): - conclusions = [] - for stat in self.statistics: - param_name = stat['param_name'] - metric = stat['metric'] - for pairstat in stat['pairwise']: - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] - if value2 == winner: - value1, value2 = value2, value1 - pvalue = stat = pairstat['ttest_ind'].pvalue - txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') - conclusions.append(txt) - return conclusions - - -class SkillTracker: - """ - Wrapper around openskill - - Args: - player_ids (List[T]): - a list of ids (usually ints) used to represent each player - - Example: - >>> # xdoctest: +REQUIRES(module:openskill) - >>> self = SkillTracker([1, 2, 3, 4, 5]) - >>> self.observe([2, 3]) # Player 2 beat player 3. - >>> self.observe([1, 2, 5, 3]) # Player 3 didnt play this round. - >>> self.observe([2, 3, 4, 5, 1]) # Everyone played, player 2 won. - >>> win_probs = self.predict_win() - >>> print('win_probs = {}'.format(ub.repr2(win_probs, nl=1, precision=2))) - win_probs = { - 1: 0.20, - 2: 0.21, - 3: 0.19, - 4: 0.20, - 5: 0.20, - } - """ - - def __init__(self, player_ids): - import openskill - self.player_ids = player_ids - self.ratings = {m: openskill.Rating() for m in player_ids} - self.observations = [] - - def predict_win(self): - """ - Estimate the probability that a particular player will win given the - current ratings. - - Returns: - Dict[T, float]: mapping from player ids to win probabilites - """ - from openskill import predict_win - teams = [[p] for p in list(self.ratings.keys())] - ratings = [[r] for r in self.ratings.values()] - probs = predict_win(ratings) - win_probs = {team[0]: prob for team, prob in zip(teams, probs)} - return win_probs - - def observe(self, ranking): - """ - After simulating a round, pass the ranked order of who won - (winner is first, looser is last) to this function. And it - updates the rankings. - - Args: - ranking (List[T]): - ranking of all the players that played in this round - winners are at the front (0-th place) of the list. 
- """ - import openskill - self.observations.append(ranking) - ratings = self.ratings - team_standings = [[r] for r in ub.take(ratings, ranking)] - new_values = openskill.rate(team_standings) # Not inplace - new_ratings = [openskill.Rating(*new[0]) for new in new_values] - ratings.update(ub.dzip(ranking, new_ratings)) + # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) if __name__ == "__main__": diff --git a/tests/benchmarker/__init__.py b/tests/benchmarker/__init__.py new file mode 100644 index 0000000..1d04095 --- /dev/null +++ b/tests/benchmarker/__init__.py @@ -0,0 +1,35 @@ +""" +A helper module for executing, serializing, combining, and comparing benchmarks +""" + +__mkinit__ = """ +# Autogenerate this file +mkinit ~/code/ultrajson/tests/benchmarker/__init__.py -w +""" + +__version__ = '0.1.0' + +from benchmarker import aggregate +from benchmarker import benchmarker +from benchmarker import process_context +from benchmarker import result_analysis +from benchmarker import util_json +from benchmarker import visualize + +from benchmarker.aggregate import (demo, demo_data,) +from benchmarker.benchmarker import (Benchmarker, BenchmarkerConfig, + BenchmarkerResult, combine_stats, + stats_dict,) +from benchmarker.process_context import (ProcessContext,) +from benchmarker.result_analysis import (Result, ResultAnalysis, SkillTracker,) +from benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'ProcessContext', 'Result', 'ResultAnalysis', 'SkillTracker', + 'aggregate', 'benchmark_analysis', 'benchmarker', 'combine_stats', + 'demo', 'demo_data', 'ensure_json_serializable', + 'find_json_unserializable', 'indexable_allclose', 'process_context', + 'result_analysis', 'stats_dict', 'util_json', 'visualize'] diff --git a/tests/benchmarker/_test_ttest.py b/tests/benchmarker/_test_ttest.py new file mode 100644 index 0000000..4e83a5d --- /dev/null +++ b/tests/benchmarker/_test_ttest.py @@ -0,0 +1,28 @@ + +def check_ttest(): + import scipy + import scipy.stats # NOQA + from benchmarker.benchmarker import stats_dict + import numpy as np + metric_vals1 = np.random.randn(10000) + 0.01 + metric_vals2 = np.random.randn(1000) + + stats1 = stats_dict(metric_vals1) + stats2 = stats_dict(metric_vals2) + + ind_kw = dict( + equal_var=0, + # alternative='two-sided' + alternative='less' if stats1['mean'] < stats2['mean'] else 'greater' + ) + + # Not sure why these are slightly different + res1 = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) + + res2 = scipy.stats.ttest_ind_from_stats( + stats1['mean'], stats1['std'], stats1['n'], + stats2['mean'], stats2['std'], stats2['n'], + **ind_kw + ) + print('res1 = {!r}'.format(res1)) + print('res2 = {!r}'.format(res2)) diff --git a/tests/benchmarker/aggregate.py b/tests/benchmarker/aggregate.py new file mode 100644 index 0000000..41d11e8 --- /dev/null +++ b/tests/benchmarker/aggregate.py @@ -0,0 +1,68 @@ +import json +import pandas as pd +import ubelt as ub + + +def demo_data(): + from benchmarker.benchmarker import Benchmarker + import numpy as np + impl_lut = { + 'numpy': np.sum, + 'builtin': sum, + } + def data_lut(params): + item = 42 if params['dtype'] == 'int' else 42.0 + data = [item] * params['size'] + return data + basis = { + 'impl': ['builtin', 'numpy'], + 'size': [10, 10000], + 'dtype': ['int', 'float'], + } + + dpath = 
ub.Path.appdir('benchmarker/agg_demo').delete().ensuredir() + + def run_one_benchmark(): + self = Benchmarker(name='agg_demo', num=10, bestof=3, basis=basis) + for params in self.iter_params(): + impl = impl_lut[params['impl']] + data = data_lut(params) + for timer in self.measure(): + with timer: + impl(data) + fpath = self.dump_in_dpath(dpath) + return fpath + + # Run the benchmark multiple times + fpaths = [] + for _ in range(5): + fpath = run_one_benchmark() + fpaths.append(fpath) + + return fpaths + + +def demo(): + from benchmarker import BenchmarkerResult + from benchmarker import result_analysis + fpaths = demo_data() + + results = [] + for fpath in fpaths: + data = json.loads(fpath.read_text()) + for row in data['rows']: + result = BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + analysis = result_analysis.ResultAnalysis( + results, + metrics=['min', 'mean'], + params=['impl'], + metric_objectives={ + 'min': 'min', + 'mean': 'min', + }) + analysis.analysis() + # single_df = pd.DataFrame(data['rows']) + # context = data['context'] + # single_df diff --git a/tests/benchmarker/benchmarker.py b/tests/benchmarker/benchmarker.py new file mode 100644 index 0000000..b488fb3 --- /dev/null +++ b/tests/benchmarker/benchmarker.py @@ -0,0 +1,230 @@ +import json +import timerit +import ubelt as ub +import numpy as np +from dataclasses import dataclass +from benchmarker.process_context import ProcessContext + + +@dataclass +class BenchmarkerConfig: + name : str = None + num : int = 100 + bestof : int = 10 + + +class BenchmarkerResult: + """ + Serialization for a single benchmark result + """ + def __init__(self, context, rows): + self.context = context + self.rows = rows + + def __json__(self): + data = { + 'type': 'benchmark_result', + 'context': self.context, + 'rows': self.rows, + } + return data + + @classmethod + def from_json(cls, data): + assert data['type'] == 'benchmark_result' + self = cls(data['context'], data['rows']) + return self + + @classmethod + def load(cls, fpath): + with open(fpath, 'r') as file: + data = json.load(file) + self = cls.from_json(data) + return self + + def to_result_list(self): + """ + Returns a list of result objects suitable for ResultAnalysis + + Returns: + List[Result] + """ + from benchmarker import result_analysis + results = [] + for row in self.rows: + result = result_analysis.Result( + name=row['name'], + metrics=row['metrics'], + params=row['params'].copy(), + ) + machine = self.context['machine'] + assert not ub.dict_isect(result.params, machine) + result.params.update(machine) + results.append(result) + return results + + +class Benchmarker: + """ + Helper to organize the execution and serialization of a benchmark + + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) + >>> from benchmarker.benchmarker import * # NOQA + >>> import numpy as np + >>> impl_lut = { + >>> 'numpy': np.sum, + >>> 'builtin': sum, + >>> } + >>> def data_lut(params): + >>> item = 42 if params['dtype'] == 'int' else 42.0 + >>> data = [item] * params['size'] + >>> return data + >>> basis = { + >>> 'impl': ['builtin', 'numpy'], + >>> 'size': [10, 10000], + >>> 'dtype': ['int', 'float'], + >>> } + >>> self = Benchmarker(name='demo', num=10, bestof=3, basis=basis) + >>> for params in self.iter_params(): + >>> impl = impl_lut[params['impl']] + >>> data = data_lut(params) + >>> for timer in self.measure(): + >>> with timer: + >>> impl(data) + >>> print('self.result = 
{}'.format(ub.repr2(self.result.__json__(), sort=0, nl=2, precision=8))) + >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir() + >>> self.dump_in_dpath(dpath) + """ + def __init__(self, basis={}, **kwargs): + self.basis = basis + + self.config = BenchmarkerConfig(**kwargs) + + self.ti = timerit.Timerit( + num=self.config.num, + bestof=self.config.bestof) + self.context = ProcessContext(name=self.config.name) + self.rows = [] + self.RECORD_ALL = 0 + self.result = None + + def dump_in_dpath(self, dpath): + dpath = ub.Path(dpath) + timestamp = self.context.obj['stop_timestamp'] + fname = f'benchmarks_{self.config.name}_{timestamp}.json' + fpath = dpath / fname + + with open(fpath, 'w') as file: + json.dump(self.result.__json__(), file) + return fpath + + def iter_params(self): + self.context.start() + grid_iter = list(ub.named_product(self.basis)) + for params in grid_iter: + self.params = params + self.key = ub.repr2(params, compact=1, si=1) + yield params + obj = self.context.stop() + self.result = BenchmarkerResult(obj, self.rows) + + def measure(self): + for timer in self.ti.reset(self.key): + yield timer + + rows = self.rows + ti = self.ti + key = self.key + params = self.params + times = ti.robust_times() + if self.RECORD_ALL: + for time in times: + metrics = { + "time": time, + } + row = { + 'name': key, + 'metrics': metrics, + 'params': params, + } + rows.append(row) + else: + times = np.array(ti.robust_times()) + metrics = stats_dict(times) + row = { + 'metrics': metrics, + 'params': params, + 'name': key, + } + rows.append(row) + + +def stats_dict(data): + stats = { + 'n': len(data), + 'mean': data.mean(), + 'std': data.std(), + 'min': data.min(), + 'max': data.max(), + } + return stats + + +def combine_stats(s1, s2): + """ + Helper for combining mean and standard deviation of multiple measurements + + Args: + s1 (dict): stats dict containing mean, std, and n + s2 (dict): stats dict containing mean, std, and n + + Example: + >>> basis = { + >>> 'n1': [1, 10, 100, 10000], + >>> 'n2': [1, 10, 100, 10000], + >>> } + >>> for params in ub.named_product(basis): + >>> data1 = np.random.rand(params['n1']) + >>> data2 = np.random.rand(params['n2']) + >>> data3 = np.hstack([data1, data2]) + >>> s1 = stats_dict(data1) + >>> s2 = stats_dict(data2) + >>> s3 = stats_dict(data3) + >>> # Check that our combo works + >>> combo_s3 = combine_stats(s1, s2) + >>> compare = pd.DataFrame({'raw': s3, 'combo': combo_s3}) + >>> print(compare) + >>> assert np.allclose(compare.raw, compare.combo) + + References: + https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations + https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups + """ + stats = [s1, s2] + sizes = np.array([s['n'] for s in stats]) + means = np.array([s['mean'] for s in stats]) + stds = np.array([s['std'] for s in stats]) + mins = np.array([s['min'] for s in stats]) + maxs = np.array([s['max'] for s in stats]) + varis = stds * stds + + combo_size = sizes.sum() + combo_mean = (sizes * means).sum() / combo_size + + mean_deltas = (means - combo_mean) + + sv = (sizes * varis).sum() + sm = (sizes * (mean_deltas * mean_deltas)).sum() + combo_vars = (sv + sm) / combo_size + combo_std = np.sqrt(combo_vars) + + combo_stats = { + 'n': combo_size, + 'mean': combo_mean, + 'std': combo_std, + 'min': mins.min(), + 'max': maxs.max(), + } + return combo_stats diff --git a/tests/benchmarker/process_context.py b/tests/benchmarker/process_context.py new file mode 100644 index 0000000..e198f9c 
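The pooled statistics returned by combine_stats above follow the standard identity for merging per-group means and population variances (within-group variance plus between-group mean shifts, divided by the combined count). A minimal self-contained numeric check of that identity, with illustrative array names, might look like:

import numpy as np

a = np.random.rand(100)
b = np.random.rand(37)
both = np.hstack([a, b])

n1, n2 = len(a), len(b)
m1, m2 = a.mean(), b.mean()
v1, v2 = a.var(), b.var()  # population variance (ddof=0), matching data.std() above

n = n1 + n2
m = (n1 * m1 + n2 * m2) / n
v = (n1 * v1 + n2 * v2 + n1 * (m1 - m) ** 2 + n2 * (m2 - m) ** 2) / n

assert np.isclose(m, both.mean())
assert np.isclose(np.sqrt(v), both.std())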
--- /dev/null +++ b/tests/benchmarker/process_context.py @@ -0,0 +1,103 @@ +import ubelt as ub +import socket +import platform +import sys + + +class ProcessContext: + """ + Context manager to track the context under which a result was computed + + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) + >>> from benchmarker.process_context import * # NOQA + >>> self = ProcessContext() + >>> obj = self.start().stop() + """ + + def __init__(self, name=None, args=None, config=None): + if args is None: + args = sys.argv + + self.obj = { + 'type': 'process_context', + 'name': name, + 'args': args, + 'config': config, + 'machine': None, + 'start_timestamp': None, + 'stop_timestamp': None, + } + + def _timestamp(self): + import datetime + timestamp = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() + timestamp = timestamp.replace(':', '') + # timestamp = ub.timestamp() + return timestamp + + def _hostinfo(self): + return { + 'host': socket.gethostname(), + 'user': ub.Path(ub.userhome()).name, + # 'cwd': os.getcwd(), + } + + def _osinfo(self): + uname_system, _, uname_release, uname_version, _, uname_processor = platform.uname() + return { + 'os_name': uname_system, + 'os_release': uname_release, + 'os_version': uname_version, + 'arch': uname_processor, + } + + def _pyinfo(self): + return { + 'py_impl': platform.python_implementation(), + 'py_version': sys.version.replace("\n", ""), + } + + def _meminfo(self): + import psutil + # TODO: could collect memory info at start and stop and intermediate + # stages. Here we just want info that is static wrt to the machine. + # For now, just get the total available. + svmem_info = psutil.virtual_memory() + return { + 'mem_total': svmem_info.total, + } + + # def _cpuinfo(self): + # import cpuinfo + # cpu_info = cpuinfo.get_cpu_info() + # return cpu_info + + def _machine(self): + return ub.dict_union( + self._hostinfo(), + self._meminfo(), + self._osinfo(), + self._pyinfo(), + ) + + def start(self): + self.obj.update({ + 'machine': self._machine(), + 'start_timestamp': self._timestamp(), + 'stop_timestamp': None, + }) + return self + + def stop(self): + self.obj.update({ + 'stop_timestamp': self._timestamp(), + }) + return self.obj + + def __enter__(self): + return self.start() + + def __exit__(self, a, b, c): + self.stop() diff --git a/tests/benchmarker/result_analysis.py b/tests/benchmarker/result_analysis.py new file mode 100644 index 0000000..1067b3e --- /dev/null +++ b/tests/benchmarker/result_analysis.py @@ -0,0 +1,722 @@ +import itertools as it +import math +import numpy as np +import pandas as pd +import ubelt as ub +import warnings +import scipy +import scipy.stats # NOQA + + +class Result(ub.NiceRepr): + """ + Storage of names, parameters, and quality metrics for a single experiment. + + Attributes: + name (str | None): + Name of the experiment. Optional. This is unused in the analysis. + (i.e. names will never be used computationally. Use them for keys) + + params (Dict[str, object]): configuration of the experiment. + This is a dictionary mapping a parameter name to its value. + + metrics (Dict[str, float]): quantitative results of the experiment + This is a dictionary for each quality metric computed on this + result. + + meta (Dict | None): any other metadata about this result. + This is unused in the analysis. 
+ + Example: + >>> self = Result.demo(rng=32) + >>> print('self = {}'.format(self)) + self = + """ + def __init__(self, name, params, metrics, meta=None): + self.name = name + self.params = params + self.metrics = metrics + self.meta = meta + + def to_dict(self): + row = ub.dict_union({'name': self.name}, self.metrics, self.params) + return row + + def __nice__(self): + row = self.to_dict() + text = ub.repr2(row, compact=True, precision=2, sort=0) + return text + + @classmethod + def demo(cls, rng=None): + import numpy as np + import string + import kwarray + rng = kwarray.ensure_rng(rng) + demo_param_space = { + 'param1': list(range(3)), + 'param2': np.linspace(0, 10, 10), + 'param3': list(string.ascii_lowercase[0:3]), + } + params = {k: rng.choice(b) for k, b in demo_param_space.items()} + metrics = { + 'f1': rng.rand(), + 'acc': rng.rand(), + } + name = ub.hash_data(params)[0:8] + self = cls(name, params, metrics) + return self + + +class ResultAnalysis(ub.NiceRepr): + """ + Groups and runs stats on results + + Runs statistical tests on sets of configuration-metrics pairs + + Attributes: + results (List[Result]): list of results + + ignore_metrics (Set[str]): metrics to ignore + + ignore_params (Set[str]): parameters to ignore + + metric_objectives (Dict[str, str]): + indicate if each metrix should be maximized "max" or minimized + "min" + + metrics (List[str]): + only consider these metrics + + params (List[str]): + if given, only consider these params + + abalation_orders (Set[int]): + The number of parameters to be held constant in each statistical + grouping. Defaults to 1, so it groups together results where 1 + variable is held constant. Including 2 will include pairwise + settings of parameters to be held constant. Using -1 or -2 means + all but 1 or 2 parameters will be held constant, repsectively. + + default_objective (str): + assume max or min for unknown metrics + + Example: + >>> self = ResultAnalysis.demo() + >>> self.analysis() + + Example: + >>> # Given a list of experiments, configs, and results + >>> # Create a ResultAnalysis object + >>> results = ResultAnalysis([ + >>> Result('expt0', {'param1': 2, 'param3': 'b'}, {'f1': 0.75}), + >>> Result('expt1', {'param1': 0, 'param3': 'c'}, {'f1': 0.92}), + >>> Result('expt2', {'param1': 1, 'param3': 'b'}, {'f1': 0.77}), + >>> Result('expt3', {'param1': 1, 'param3': 'a'}, {'f1': 0.67}), + >>> Result('expt4', {'param1': 0, 'param3': 'c'}, {'f1': 0.98}), + >>> Result('expt5', {'param1': 2, 'param3': 'a'}, {'f1': 0.86}), + >>> Result('expt6', {'param1': 1, 'param3': 'c'}, {'f1': 0.77}), + >>> Result('expt7', {'param1': 1, 'param3': 'c'}, {'f1': 0.41}), + >>> Result('expt8', {'param1': 1, 'param3': 'a'}, {'f1': 0.64}), + >>> Result('expt9', {'param1': 0, 'param3': 'b'}, {'f1': 0.95}), + >>> ]) + >>> # Calling the analysis method prints something like the following + >>> results.analysis() + + PARAMETER 'param1' - f1 + ======================= + f1 mean std max min num best + param1 + 0 0.950 0.030000 0.98 0.92 3.0 0.98 + 2 0.805 0.077782 0.86 0.75 2.0 0.86 + 1 0.652 0.147377 0.77 0.41 5.0 0.77 + + ANOVA hypothesis (roughly): the param 'param1' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.0397 + Mean-ANOVA: p=0.0277 + + Pairwise T-Tests + Is param1=0 about as good as param1=2? + ttest_ind: p=0.2058 + Is param1=1 about as good as param1=2? 
+ ttest_ind: p=0.1508 + + + PARAMETER 'param3' - f1 + ======================= + f1 mean std max min num best + param3 + c 0.770000 0.255734 0.98 0.41 4.0 0.98 + b 0.823333 0.110151 0.95 0.75 3.0 0.95 + a 0.723333 0.119304 0.86 0.64 3.0 0.86 + + ANOVA hypothesis (roughly): the param 'param3' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.5890 + Mean-ANOVA: p=0.8145 + + Pairwise T-Tests + Is param3=b about as good as param3=c? + ttest_ind: p=0.7266 + Is param3=a about as good as param3=b? + ttest_ind: p=0.3466 + ttest_rel: p=0.3466 + Is param3=a about as good as param3=c? + ttest_ind: p=0.7626 + """ + + def __init__(self, results, metrics=None, params=None, ignore_params=None, + ignore_metrics=None, metric_objectives=None, + abalation_orders={1}, default_objective='max'): + self.results = results + if ignore_metrics is None: + ignore_metrics = set() + if ignore_params is None: + ignore_params = set() + self.ignore_params = ignore_params + self.ignore_metrics = ignore_metrics + + self.abalation_orders = abalation_orders + self.default_objective = default_objective + + # encode if we want to maximize or minimize a metric + default_metric_to_objective = { + 'ap': 'max', + 'acc': 'max', + 'f1': 'max', + # + 'loss': 'min', + 'brier': 'min', + } + if metric_objectives is None: + metric_objectives = {} + + self.metric_objectives = default_metric_to_objective.copy() + self.metric_objectives.update(metric_objectives) + + self.params = params + self.metrics = metrics + self.statistics = None + + self._description = {} + self._description['built'] = False + self._description['num_results'] = len(self.results) + + def __nice__(self): + # if len(self._description) == 0: + # return 'unbuilt' + # else: + return ub.repr2(self._description, si=1, sv=1) + + @classmethod + def demo(cls, num=10, rng=None): + import kwarray + rng = kwarray.ensure_rng(rng) + results = [Result.demo(rng=rng) for _ in range(num)] + self = cls(results, metrics={'f1', 'acc'}) + return self + + def run(self): + self.build() + self.report() + + def analysis(self): + # alias for run + return self.run() + self.build() + self.report() + + @ub.memoize_property + def table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + def metric_table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + @ub.memoize_property + def varied(self): + config_rows = [r.params for r in self.results] + sentinel = object() + # pd.DataFrame(config_rows).channels + varied = dict(ub.varied_values(config_rows, default=sentinel, min_variations=1)) + # remove nans + varied = { + k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} + for k, vs in varied.items()} + varied = {k: vs for k, vs in varied.items() if len(vs)} + return varied + + def abalation_groups(self, param): + """ + Return groups where the specified parameter(s) are varied, but all + other non-ignored parameters are held the same. 
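As a toy illustration of the grouping this method performs (made-up parameter names and scores): rows that agree on every other varied parameter form one group, and only the requested parameter changes inside it.

import pandas as pd

table = pd.DataFrame([
    {'param1': 0, 'param2': 'a', 'f1': 0.90},
    {'param1': 0, 'param2': 'b', 'f1': 0.70},
    {'param1': 1, 'param2': 'a', 'f1': 0.95},
    {'param1': 1, 'param2': 'b', 'f1': 0.80},
])
# Ablating 'param2' means grouping on the remaining varied parameter, 'param1'.
for _, group in table.groupby(['param1'], dropna=False):
    print(group)  # two groups of two rows; only 'param2' differs within each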
+ + Example: + >>> self = ResultAnalysis.demo() + >>> param = 'param2' + >>> self.abalation_groups(param) + """ + if not ub.iterable(param): + param = [param] + table = self.table + config_rows = [r.params for r in self.results] + config_keys = list(map(set, config_rows)) + # if self.params: + # config_keys = list(self.params) + if self.ignore_params: + config_keys = [c - self.ignore_params for c in config_keys] + isect_params = set.intersection(*config_keys) + other_params = sorted(isect_params - set(param)) + groups = [] + for key, group in table.groupby(other_params, dropna=False): + if len(group) > 1: + groups.append(group) + return groups + + def abalate(self, param): + """ + Example: + >>> self = ResultAnalysis.demo(100) + >>> param = 'param2' + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self.abalate(param) + + >>> self = ResultAnalysis.demo() + >>> param = ['param2', 'param3'] + >>> self.abalate(param) + """ + import itertools as it + if self.table is None: + self.table = self.build_table() + if not ub.iterable(param): + param = [param] + + # For hashable generic dictionary + from collections import namedtuple + gd = namedtuple('config', param) + + # from types import SimpleNamespace + param_unique_vals_ = self.table[param].drop_duplicates().to_dict('records') + param_unique_vals = [gd(**d) for d in param_unique_vals_] + # param_unique_vals = {p: self.table[p].unique().tolist() for p in param} + score_improvements = ub.ddict(list) + scored_obs = [] + skillboard = SkillTracker(param_unique_vals) + groups = self.abalation_groups(param) + + for group in groups: + for metric_key in self.metrics: + ascending = self._objective_is_ascending(metric_key) + + group = group.sort_values(metric_key, ascending=ascending) + subgroups = group.groupby(param) + if ascending: + best_idx = subgroups[metric_key].idxmax() + else: + best_idx = subgroups[metric_key].idxmin() + best_group = group.loc[best_idx] + best_group = best_group.sort_values(metric_key, ascending=ascending) + + for x1, x2 in it.product(best_group.index, best_group.index): + if x1 != x2: + r1 = best_group.loc[x1] + r2 = best_group.loc[x2] + k1 = gd(**r1[param]) + k2 = gd(**r2[param]) + diff = r1[metric_key] - r2[metric_key] + score_improvements[(k1, k2, metric_key)].append(diff) + + # metric_vals = best_group[metric_key].values + # diffs = metric_vals[None, :] - metric_vals[:, None] + best_group.set_index(param) + # best_group[param] + # best_group[metric_key].diff() + scored_ranking = best_group[param + [metric_key]].reset_index(drop=True) + scored_obs.append(scored_ranking) + ranking = [gd(**d) for d in scored_ranking[param].to_dict('records')] + skillboard.observe(ranking) + + print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) + win_probs = skillboard.predict_win() + print('win_probs = {}'.format(ub.repr2(win_probs, nl=1))) + for key, improves in score_improvements.items(): + k1, k2, metric_key = key + improves = np.array(improves) + pos_delta = improves[improves > 0] + print(f'\nWhen {k1} is better than {k2}, the improvement in {metric_key} is') + print(pd.DataFrame([pd.Series(pos_delta).describe().T])) + return scored_obs + + def _objective_is_ascending(self, metric_key): + """ + Return True if we should minimize the objective (lower is better) + Return False if we should maximize the objective (higher is better) + """ + objective = self.metric_objectives.get(metric_key, None) + if objective is None: + warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + objective 
= self.default_objective + ascending = (objective == 'min') + return ascending + + def test_group(self, param_group, metric_key): + """ + Get stats for a particular metric / constant group + + Args: + param_group (List[str]): group of parameters to hold constant. + metric_key (str): The metric to test. + + Returns: + dict + # TODO : document these stats clearly and accurately + + Example: + >>> self = ResultAnalysis.demo(num=30) + >>> print(self.table) + >>> param_group = ['param2'] + >>> metric_key = 'f1' + >>> stats_row = self.test_group(param_group, metric_key) + >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) + >>> # --- + >>> self.build() + >>> self.report() + """ + param_group_name = ','.join(param_group) + stats_row = { + 'param_name': param_group_name, + 'metric': metric_key, + } + # param_values = varied[param_name] + # stats_row['param_values'] = param_values + ascending = self._objective_is_ascending(metric_key) + + # Find all items with this particular param value + value_to_metric_group = {} + value_to_metric_stats = {} + value_to_metric = {} + + varied_cols = sorted(self.varied.keys()) + + # Not sure if this is the right name, these are the other param keys + # that we are not directly investigating, but might have an impact. + # We use these to select comparable rows for pairwise t-tests + nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) + + for param_value, group in self.table.groupby(param_group): + metric_group = group[['name', metric_key] + varied_cols] + metric_vals = metric_group[metric_key] + metric_vals = metric_vals.dropna() + if len(metric_vals) > 0: + metric_stats = metric_vals.describe() + value_to_metric_stats[param_value] = metric_stats + value_to_metric_group[param_value] = metric_group + value_to_metric[param_value] = metric_vals.values + + moments = pd.DataFrame(value_to_metric_stats).T + moments = moments.sort_values('mean', ascending=ascending) + moments.index.name = param_group_name + moments.columns.name = metric_key + ranking = moments['mean'].index.to_list() + param_to_rank = ub.invert_dict(dict(enumerate(ranking))) + + # Determine a set of value pairs to do pairwise comparisons on + value_pairs = ub.oset() + value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) + value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) + + # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance + # If the researcher can make the assumptions of an identically + # shaped and scaled distribution for all groups, except for any + # difference in medians, then the null hypothesis is that the + # medians of all groups are equal, and the alternative + # hypothesis is that at least one population median of one + # group is different from the population median of at least one + # other group. 
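For orientation, this is roughly how the two tests invoked just below behave on made-up groups of metric values (one group per parameter value); a small p-value is evidence that the parameter has an effect.

import scipy.stats

g_a = [0.91, 0.93, 0.95]  # metric values observed with param=a
g_b = [0.71, 0.74, 0.70]  # metric values observed with param=b
g_c = [0.72, 0.75, 0.73]  # metric values observed with param=c

krus = scipy.stats.kruskal(g_a, g_b, g_c)    # rank-based, no normality assumption
anova = scipy.stats.f_oneway(g_a, g_b, g_c)  # compares group means directly
print(krus.pvalue, anova.pvalue)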
+ try: + anova_krus_result = scipy.stats.kruskal(*value_to_metric.values()) + except ValueError: + anova_krus_result = scipy.stats.stats.KruskalResult(np.nan, np.nan) + + # https://en.wikipedia.org/wiki/One-way_analysis_of_variance + # The One-Way ANOVA tests the null hypothesis, which states + # that samples in all groups are drawn from populations with + # the same mean values + if len(value_to_metric) > 1: + anova_1way_result = scipy.stats.f_oneway(*value_to_metric.values()) + else: + anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) + + stats_row['anova_rank_H'] = anova_krus_result.statistic + stats_row['anova_rank_p'] = anova_krus_result.pvalue + stats_row['anova_mean_F'] = anova_1way_result.statistic + stats_row['anova_mean_p'] = anova_1way_result.pvalue + stats_row['moments'] = moments + + pairwise_statistics = [] + for pair in value_pairs: + pair_statistics = {} + # try: + # param_val1, param_val2 = sorted(pair) + # except Exception: + # param_val1, param_val2 = (pair) + param_val1, param_val2 = pair + + metric_vals1 = value_to_metric[param_val1] + metric_vals2 = value_to_metric[param_val2] + + rank1 = param_to_rank[param_val1] + rank2 = param_to_rank[param_val2] + pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 + pair_statistics['value1'] = param_val1 + pair_statistics['value2'] = param_val2 + pair_statistics['n1'] = len(metric_vals1) + pair_statistics['n2'] = len(metric_vals2) + # TODO: probably want to use an alternative=less or greater here + # instead of simply unequal + alternative = 'two-sided' + if 1: + if ascending: + # We want to minimize the metric + alternative = 'less' if rank1 < rank2 else 'greater' + else: + # We want to maximize the metric + alternative = 'greater' if rank1 < rank2 else 'less' + + ind_kw = dict( + equal_var=False, + alternative=alternative, + ) + ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) + + if 0: + from benchmarker.benchmarker import stats_dict + stats1 = stats_dict(metric_vals1) + stats2 = stats_dict(metric_vals2) + scipy.stats.ttest_ind_from_stats( + stats1['mean'], stats1['std'], stats1['n'], + stats2['mean'], stats2['std'], stats2['n'], + **ind_kw + ) + # metric_vals1, metric_vals2, equal_var=False) + + scipy.stats.ttest_ind_from_stats + + pair_statistics['ttest_ind'] = ttest_ind_result + + # Do relative checks, need to find comparable subgroups + metric_group1 = value_to_metric_group[param_val1] + metric_group2 = value_to_metric_group[param_val2] + nuisance_vals1 = metric_group1[nuisance_cols] + nuisance_vals2 = metric_group2[nuisance_cols] + nk_to_group1 = dict(list(nuisance_vals1.groupby(nuisance_cols))) + nk_to_group2 = dict(list(nuisance_vals2.groupby(nuisance_cols))) + common = set(nk_to_group1) & set(nk_to_group2) + comparable_indexes1 = [] + comparable_indexes2 = [] + if common: + for nk in common: + group1 = nk_to_group1[nk] + group2 = nk_to_group2[nk] + for i, j in it.product(group1.index, group2.index): + comparable_indexes1.append(i) + comparable_indexes2.append(j) + + comparable_groups1 = metric_group1.loc[comparable_indexes1, metric_key] + comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] + + # Does this need to have the values aligned? 
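On the alignment question above: scipy.stats.ttest_rel pairs observations by position, so the two sequences must have equal length and position i of one must correspond to position i of the other. The it.product loop above appends the two index lists in lockstep, so each position already refers to the same nuisance-key match. A toy paired example with made-up numbers:

import scipy.stats

group1 = [10.1, 12.3, 9.8, 11.0]
group2 = [9.9, 12.0, 9.7, 10.5]  # group2[i] is the paired partner of group1[i]
res = scipy.stats.ttest_rel(group1, group2)
print(res.pvalue)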
+ ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) + pair_statistics['n_common'] = len(common) + pair_statistics['ttest_rel'] = ttest_rel_result + pairwise_statistics.append(pair_statistics) + + stats_row['pairwise'] = pairwise_statistics + return stats_row + + def build(self): + import itertools as it + if len(self.results) < 2: + raise Exception('need at least 2 results') + + varied = self.varied.copy() + if self.ignore_params: + for k in self.ignore_params: + varied.pop(k, None) + if self.params: + varied = ub.dict_isect(varied, self.params) + + # Experimental: + # Find Auto-abalation groups + # TODO: when the group size is -1, instead of showing all of the group + # settings, for each group setting do the k=1 analysis within that group + varied_param_names = list(varied.keys()) + num_varied_params = len(varied) + held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} + held_constant_orders = [i for i in held_constant_orders if i > 0] + held_constant_groups = [] + for k in held_constant_orders: + held_constant_groups.extend( + list(map(list, it.combinations(varied_param_names, k)))) + + if self.metrics is None: + avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) + metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) + else: + metrics_of_interest = self.metrics + self.metrics_of_interest = metrics_of_interest + self._description['metrics_of_interest'] = metrics_of_interest + self._description['num_groups'] = len(held_constant_groups) + + # Analyze the impact of each parameter + self.statistics = statistics = [] + for param_group in held_constant_groups: + for metric_key in metrics_of_interest: + stats_row = self.test_group(param_group, metric_key) + statistics.append(stats_row) + + self.stats_table = pd.DataFrame([ + ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) + for d in self.statistics]) + + if len(self.stats_table): + self.stats_table = self.stats_table.sort_values('anova_rank_p') + + self._description['built'] = True + + def report(self): + p_threshold = 0.05 + stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) + stat_groups_items = list(stat_groups.items()) + + # Modify this order to change the grouping pattern + grid = ub.named_product({ + 'stat_group_item': stat_groups_items, + 'metrics': self.metrics_of_interest, + }) + for grid_item in grid: + metric_key = grid_item['metrics'] + stat_groups_item = grid_item['stat_group_item'] + + param_name, stat_group = stat_groups_item + stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] + title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) + print('\n\n') + print(title) + print('=' * len(title)) + print(stats_row['moments']) + anova_rank_p = stats_row['anova_rank_p'] + anova_mean_p = stats_row['anova_mean_p'] + # Rougly speaking + print('') + print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print('') + print('Pairwise T-Tests') + for pairstat in stats_row['pairwise']: + # Is this backwards? 
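On the "Is this backwards?" question above: the swap that follows only normalizes the print order so that the ranked winner is the value reported as possibly outperforming the other. A minimal check of that invariant:

value1, value2, winner = 'b', 'a', 'a'
if value2 == winner:
    value1, value2 = value2, value1
assert value1 == winner  # the winner is always printed first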
+ value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') + if 'ttest_ind' in pairstat: + ttest_ind_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + if 'ttest_rel' in pairstat: + n_common = pairstat['n_common'] + ttest_rel_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + + print(self.stats_table) + + def conclusions(self): + conclusions = [] + for stat in self.statistics: + param_name = stat['param_name'] + metric = stat['metric'] + for pairstat in stat['pairwise']: + value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + pvalue = stat = pairstat['ttest_ind'].pvalue + txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') + conclusions.append(txt) + return conclusions + + +class SkillTracker: + """ + Wrapper around openskill + + Args: + player_ids (List[T]): + a list of ids (usually ints) used to represent each player + + Example: + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self = SkillTracker([1, 2, 3, 4, 5]) + >>> self.observe([2, 3]) # Player 2 beat player 3. + >>> self.observe([1, 2, 5, 3]) # Player 3 didnt play this round. + >>> self.observe([2, 3, 4, 5, 1]) # Everyone played, player 2 won. + >>> win_probs = self.predict_win() + >>> print('win_probs = {}'.format(ub.repr2(win_probs, nl=1, precision=2))) + win_probs = { + 1: 0.20, + 2: 0.21, + 3: 0.19, + 4: 0.20, + 5: 0.20, + } + """ + + def __init__(self, player_ids): + import openskill + self.player_ids = player_ids + self.ratings = {m: openskill.Rating() for m in player_ids} + # self.observations = [] + + def predict_win(self): + """ + Estimate the probability that a particular player will win given the + current ratings. + + Returns: + Dict[T, float]: mapping from player ids to win probabilites + """ + from openskill import predict_win + teams = [[p] for p in list(self.ratings.keys())] + ratings = [[r] for r in self.ratings.values()] + probs = predict_win(ratings) + win_probs = {team[0]: prob for team, prob in zip(teams, probs)} + return win_probs + + def observe(self, ranking): + """ + After simulating a round, pass the ranked order of who won + (winner is first, looser is last) to this function. And it + updates the rankings. + + Args: + ranking (List[T]): + ranking of all the players that played in this round + winners are at the front (0-th place) of the list. 
+ """ + import openskill + # self.observations.append(ranking) + ratings = self.ratings + team_standings = [[r] for r in ub.take(ratings, ranking)] + # new_values = openskill.rate(team_standings) # Not inplace + # new_ratings = [openskill.Rating(*new[0]) for new in new_values] + new_team_ratings = openskill.rate(team_standings) + new_ratings = [new[0] for new in new_team_ratings] + ratings.update(ub.dzip(ranking, new_ratings)) diff --git a/tests/benchmarker/util_json.py b/tests/benchmarker/util_json.py new file mode 100644 index 0000000..dc3da85 --- /dev/null +++ b/tests/benchmarker/util_json.py @@ -0,0 +1,233 @@ +import copy +import numpy as np +import ubelt as ub +import json +from collections import OrderedDict +import pathlib + + +def ensure_json_serializable(dict_, normalize_containers=False, verbose=0): + """ + Attempt to convert common types (e.g. numpy) into something json complient + + Convert numpy and tuples into lists + + Args: + normalize_containers (bool, default=False): + if True, normalizes dict containers to be standard python + structures. + + Example: + >>> data = ub.ddict(lambda: int) + >>> data['foo'] = ub.ddict(lambda: int) + >>> data['bar'] = np.array([1, 2, 3]) + >>> data['foo']['a'] = 1 + >>> data['foo']['b'] = (1, np.array([1, 2, 3]), {3: np.int32(3), 4: np.float16(1.0)}) + >>> dict_ = data + >>> print(ub.repr2(data, nl=-1)) + >>> assert list(find_json_unserializable(data)) + >>> result = ensure_json_serializable(data, normalize_containers=True) + >>> print(ub.repr2(result, nl=-1)) + >>> assert not list(find_json_unserializable(result)) + >>> assert type(result) is dict + """ + dict_ = copy.deepcopy(dict_) + + def _norm_container(c): + if isinstance(c, dict): + # Cast to a normal dictionary + if isinstance(c, OrderedDict): + if type(c) is not OrderedDict: + c = OrderedDict(c) + else: + if type(c) is not dict: + c = dict(c) + return c + + walker = ub.IndexableWalker(dict_) + for prefix, value in walker: + if isinstance(value, tuple): + new_value = list(value) + walker[prefix] = new_value + elif isinstance(value, np.ndarray): + new_value = value.tolist() + walker[prefix] = new_value + elif isinstance(value, (np.integer)): + new_value = int(value) + walker[prefix] = new_value + elif isinstance(value, (np.floating)): + new_value = float(value) + walker[prefix] = new_value + elif isinstance(value, (np.complexfloating)): + new_value = complex(value) + walker[prefix] = new_value + elif isinstance(value, pathlib.Path): + new_value = str(value) + walker[prefix] = new_value + elif hasattr(value, '__json__'): + new_value = value.__json__() + walker[prefix] = new_value + elif normalize_containers: + if isinstance(value, dict): + new_value = _norm_container(value) + walker[prefix] = new_value + + if normalize_containers: + # normalize the outer layer + dict_ = _norm_container(dict_) + return dict_ + + +def find_json_unserializable(data, quickcheck=False): + """ + Recurse through json datastructure and find any component that + causes a serialization error. Record the location of these errors + in the datastructure as we recurse through the call tree. + + Args: + data (object): data that should be json serializable + quickcheck (bool): if True, check the entire datastructure assuming + its ok before doing the python-based recursive logic. 
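A hedged usage sketch for the two helpers in this module, assuming the benchmarker package is importable as in the doctests above: first locate values that json.dumps would reject, then coerce them to plain Python types.

import json
import numpy as np
from benchmarker.util_json import ensure_json_serializable, find_json_unserializable

row = {'mean': np.float32(0.25), 'times': np.array([1, 2, 3])}
print(list(find_json_unserializable(row)))  # reports the 'loc' of each offending value
fixed = ensure_json_serializable(row)       # float32 -> float, ndarray -> list
print(json.dumps(fixed))                    # now serializes cleanly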
+ + Returns: + List[Dict]: list of "bad part" dictionaries containing items + 'value' - the value that caused the serialization error + 'loc' - which contains a list of key/indexes that can be used + to lookup the location of the unserializable value. + If the "loc" is a list, then it indicates a rare case where + a key in a dictionary is causing the serialization error. + + Example: + >>> part = ub.ddict(lambda: int) + >>> part['foo'] = ub.ddict(lambda: int) + >>> part['bar'] = np.array([1, 2, 3]) + >>> part['foo']['a'] = 1 + >>> # Create a dictionary with two unserializable parts + >>> data = [1, 2, {'nest1': [2, part]}, {frozenset({'badkey'}): 3, 2: 4}] + >>> parts = list(find_json_unserializable(data)) + >>> print('parts = {}'.format(ub.repr2(parts, nl=1))) + >>> # Check expected structure of bad parts + >>> assert len(parts) == 2 + >>> part = parts[1] + >>> assert list(part['loc']) == [2, 'nest1', 1, 'bar'] + >>> # We can use the "loc" to find the bad value + >>> for part in parts: + >>> # "loc" is a list of directions containing which keys/indexes + >>> # to traverse at each descent into the data structure. + >>> directions = part['loc'] + >>> curr = data + >>> special_flag = False + >>> for key in directions: + >>> if isinstance(key, list): + >>> # special case for bad keys + >>> special_flag = True + >>> break + >>> else: + >>> # normal case for bad values + >>> curr = curr[key] + >>> if special_flag: + >>> assert part['data'] in curr.keys() + >>> assert part['data'] is key[1] + >>> else: + >>> assert part['data'] is curr + """ + needs_check = True + if quickcheck: + try: + # Might be a more efficient way to do this check. We duplicate a lot of + # work by doing the check for unserializable data this way. + json.dumps(data) + except Exception: + # If there is unserializable data, find out where it is. + # is_serializable = False + pass + else: + # is_serializable = True + needs_check = False + + if needs_check: + # mode = 'new' + # if mode == 'new': + scalar_types = (int, float, str, type(None)) + container_types = (tuple, list, dict) + serializable_types = scalar_types + container_types + walker = ub.IndexableWalker(data) + for prefix, value in walker: + *root, key = prefix + if not isinstance(key, scalar_types): + # Special case where a dict key is the error value + # Purposely make loc non-hashable so its not confused with + # an address. All we can know in this case is that they key + # is at this level, there is no concept of where. + yield {'loc': root + [['.keys', key]], 'data': key} + elif not isinstance(value, serializable_types): + yield {'loc': prefix, 'data': value} + + +def indexable_allclose(dct1, dct2, return_info=False): + """ + Walks through two nested data structures and ensures that everything is + roughly the same. 
+ + Args: + dct1: a nested indexable item + dct2: a nested indexable item + + Example: + >>> dct1 = { + >>> 'foo': [1.222222, 1.333], + >>> 'bar': 1, + >>> 'baz': [], + >>> } + >>> dct2 = { + >>> 'foo': [1.22222, 1.333], + >>> 'bar': 1, + >>> 'baz': [], + >>> } + >>> assert indexable_allclose(dct1, dct2) + """ + walker1 = ub.IndexableWalker(dct1) + walker2 = ub.IndexableWalker(dct2) + flat_items1 = [ + (path, value) for path, value in walker1 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + flat_items2 = [ + (path, value) for path, value in walker2 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + + flat_items1 = sorted(flat_items1) + flat_items2 = sorted(flat_items2) + + if len(flat_items1) != len(flat_items2): + info = { + 'faillist': ['length mismatch'] + } + final_flag = False + else: + passlist = [] + faillist = [] + + for t1, t2 in zip(flat_items1, flat_items2): + p1, v1 = t1 + p2, v2 = t2 + assert p1 == p2 + + flag = (v1 == v2) + if not flag: + if isinstance(v1, float) and isinstance(v2, float) and np.isclose(v1, v2): + flag = True + if flag: + passlist.append(p1) + else: + faillist.append((p1, v1, v2)) + + final_flag = len(faillist) == 0 + info = { + 'passlist': passlist, + 'faillist': faillist, + } + + if return_info: + return final_flag, info + else: + return final_flag diff --git a/tests/benchmarker/visualize.py b/tests/benchmarker/visualize.py new file mode 100644 index 0000000..41f4679 --- /dev/null +++ b/tests/benchmarker/visualize.py @@ -0,0 +1,113 @@ +import pandas as pd +import ubelt as ub + + +def benchmark_analysis(rows, xlabel, group_labels, basis, ): + # xlabel = "size" + # Set these to empty lists if they are not used + # group_labels = { + # "col": ["input"], + # "hue": ["impl"], + # "size": [], + # } + # group_keys = {} + # for gname, labels in group_labels.items(): + # group_keys[gname + "_key"] = ub.repr2( + # ub.dict_isect(params, labels), compact=1, si=1 + # ) + # key = ub.repr2(params, compact=1, si=1) + + from process_tracker.result_analysis import SkillTracker + RECORD_ALL = 0 + + USE_OPENSKILL = True + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "min" + + # The rows define a long-form pandas data array. + # Data in long-form makes it very easy to use seaborn. + data = pd.DataFrame(rows) + data = data.sort_values(metric_key) + + if RECORD_ALL: + # Show the min / mean if we record all + min_times = data.groupby("key").min().rename({"time": "min"}, axis=1) + mean_times = ( + data.groupby("key")[["time"]].mean().rename({"time": "mean"}, axis=1) + ) + stats_data = pd.concat([min_times, mean_times], axis=1) + stats_data = stats_data.sort_values("min") + else: + stats_data = data + + if USE_OPENSKILL: + # Track the "skill" of each method + # The idea is that each setting of parameters is a game, and each + # "impl" is a player. We rank the players by which is fastest, and + # update their ranking according to the Weng-Lin Bayes ranking model. + # This does not take the fact that some "games" (i.e. parameter + # settings) are more important than others, but it should be fairly + # robust on average. 
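A hedged sketch of the aggregation idea described in the comment above, assuming the benchmarker package layout from the __init__ and that openskill is installed; the impl names are illustrative. Each parameter setting counts as one game, with the fastest implementation listed first.

from benchmarker.result_analysis import SkillTracker

skillboard = SkillTracker(['ujson', 'json', 'simplejson'])
skillboard.observe(['ujson', 'simplejson', 'json'])  # ujson fastest on this setting
skillboard.observe(['ujson', 'json', 'simplejson'])
print(skillboard.predict_win())  # e.g. ujson ends up with the largest win probability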
+ skillboard = SkillTracker(basis["impl"]) + + other_keys = sorted( + set(stats_data.columns) + - {"key", "impl", "min", "mean", "hue_key", "size_key", "style_key"} + ) + for params, variants in stats_data.groupby(other_keys): + variants = variants.sort_values("mean") + ranking = variants["impl"].reset_index(drop=True) + + mean_speedup = variants["mean"].max() / variants["mean"] + stats_data.loc[mean_speedup.index, "mean_speedup"] = mean_speedup + min_speedup = variants["min"].max() / variants["min"] + stats_data.loc[min_speedup.index, "min_speedup"] = min_speedup + + if USE_OPENSKILL: + skillboard.observe(ranking) + + print("Statistics:") + print(stats_data) + + if USE_OPENSKILL: + win_probs = skillboard.predict_win() + win_probs = ub.sorted_vals(win_probs, reverse=True) + print( + "Aggregated Rankings = {}".format( + ub.repr2(win_probs, nl=1, precision=4, align=":") + ) + ) + + plot = True + if plot: + # import seaborn as sns + # kwplot autosns works well for IPython and script execution. + # not sure about notebooks. + import seaborn as sns + + sns.set() + from matplotlib import pyplot as plt + + plotkw = {} + for gname, labels in group_labels.items(): + if labels: + plotkw[gname] = gname + "_key" + + # Your variables may change + # ax = plt.figure().gca() + col = plotkw.pop("col") + facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) + facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) + facet.add_legend() + # sns.lineplot(data=data, ) + # ax.set_title('JSON Benchmarks') + # ax.set_xlabel('Size') + # ax.set_ylabel('Time') + # ax.set_xscale('log') + # ax.set_yscale('log') + + try: + __IPYTHON__ + except NameError: + plt.show() From d036df252ffcc77beec3cbdb0e0f5290eba7195d Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 07:36:27 -0400 Subject: [PATCH 03/25] Port datasets --- tests/benchmark3.py | 217 ++++++++++++++------ tests/benchmarker/_test_ttest.py | 28 --- tests/benchmarker/benchmarker.py | 38 ++-- tests/benchmarker/result_analysis.py | 293 ++++++++++++++++++--------- 4 files changed, 368 insertions(+), 208 deletions(-) delete mode 100644 tests/benchmarker/_test_ttest.py diff --git a/tests/benchmark3.py b/tests/benchmark3.py index 181b2a4..b6e084e 100644 --- a/tests/benchmark3.py +++ b/tests/benchmark3.py @@ -8,58 +8,136 @@ import sys import ubelt as ub -def data_lut(input, size): - if input == "Array with UTF-8 strings": - test_object = [] - for x in range(size): - test_object.append( - "نظام الحكم سلطاني وراثي " - "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" - " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " - ) +def json_test_data_generators(): + """ + Generates data for benchmarks with various sizes + + Returns: + Dict[str, callable]: + a mapping from test data name to its generator + + Example: + >>> data_lut = json_test_data_generators() + >>> size = 2 + >>> keys = sorted(set(data_lut) - {'Complex object'}) + >>> for key in keys: + >>> func = data_lut[key] + >>> test_object = func(size) + >>> print('key = {!r}'.format(key)) + >>> print('test_object = {!r}'.format(test_object)) + """ + data_lut = {} + def _register_data(name): + def _wrap(func): + data_lut[name] = func + return _wrap + + # seed if desired + #rng = random.Random() + rng = random + + @_register_data('Array with doubles') + def array_with_doubles(size): + test_object = [sys.maxsize * rng.random() for _ in range(size)] return test_object - elif input == "Array with doubles": - test_object = [] - for x in 
range(256): - test_object.append(sys.maxsize * random.random()) - else: - raise KeyError(input) + + @_register_data('Array with UTF-8 strings') + def array_with_utf8_strings(size): + utf8_string = ( + "نظام الحكم سلطاني وراثي " + "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" + " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " + ) + test_object = [utf8_string for _ in range(size)] + return test_object + + @_register_data('Medium complex object') + def medium_complex_object(size): + user = { + "userId": 3381293, + "age": 213, + "username": "johndoe", + "fullname": "John Doe the Second", + "isAuthorized": True, + "liked": 31231.31231202, + "approval": 31.1471, + "jobs": [1, 2], + "currJob": None, + } + friends = [user, user, user, user, user, user, user, user] + test_object = [[user, friends] for _ in range(size)] + return test_object + + @_register_data('Array with True values') + def true_values(size): + test_object = [True for _ in range(size)] + return test_object + + @_register_data('Array of Dict[str, int]') + def array_of_dict_string_int(size): + test_object = [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(size) + ] + return test_object + + @_register_data('Dict of List[Dict[str, int]]') + def dict_of_list_dict_str_int(size): + keys = set() + while len(keys) < size: + key = str(rng.random() * 20) + keys.add(key) + test_object = { + key: [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(256) + ] + for key in keys + } + return test_object + + @_register_data('Complex object') + def complex_object(size): + import json + # TODO: might be better to reigster this file with setup.py or + # download it via some mechanism + try: + dpath = ub.Path(__file__).parent + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + except Exception: + import ujson + dpath = ub.Path(ujson.__file__).parent / 'tests' + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + with open(fpath, 'r') as f: + test_object = json.load(f) + if size > 1: + test_object = [test_object] * size + return test_object + + return data_lut def available_json_impls(): - JSON_IMPLS = {} - - try: - import json - JSON_IMPLS["json"] = json - except ImportError: - pass - - try: - import ujson - JSON_IMPLS["ujson"] = ujson - except ImportError: - pass - - try: - import nujson - JSON_IMPLS["nujson"] = nujson - except ImportError: - pass - - try: - import orjson - JSON_IMPLS["nujson"] = orjson - except ImportError: - pass - - try: - import simplejson - JSON_IMPLS["simplejson"] = simplejson - except ImportError: - pass - - return JSON_IMPLS + import importlib + known_modnames = [ + 'ujson', 'json', 'nujson', 'orjson', 'simplejson' + ] + json_impls = {} + for libname in known_modnames: + try: + module = importlib.import_module(libname) + except ImportError: + pass + else: + json_impls[libname] = { + 'module': module, + 'version': module.__version__, + } + return json_impls def benchmark_json_dumps(): @@ -67,28 +145,34 @@ def benchmark_json_dumps(): sys.path.append(ub.expandpath('~/code/ultrajson/tests')) from benchmarker import Benchmarker - JSON_IMPLS = available_json_impls() + json_impls = available_json_impls() + data_lut = json_test_data_generators() - version_infos = {k: v.__version__ for k, v in JSON_IMPLS.items()} - - def method_lut(impl): - return JSON_IMPLS[impl].dumps + list(data_lut.keys()) # These are the parameters that we benchmark over basis = { "input": [ - "Array with UTF-8 strings", 
- "Array with doubles", + 'Array with doubles', + 'Array with UTF-8 strings', + # 'Medium complex object', + 'Array with True values', + 'Array of Dict[str, int]', + # 'Dict of List[Dict[str, int]]', + # 'Complex object' ], "size": [1, 32, 256, 1024, 2048], - "impl": list(JSON_IMPLS.keys()), + "impl": list(json_impls.keys()), } + # The Benchmarker class is a new experimental API around timerit to + # abstract away the details of timing a process over a grid of parameters, + # serializing the results, and aggregating results from disparate runs. benchmark = Benchmarker( name='bench_json_dumps', - # Change params here to modify number of trials num=100, bestof=10, + verbose=2, basis=basis, ) @@ -96,11 +180,11 @@ def benchmark_json_dumps(): for params in benchmark.iter_params(): # Make any modifications you need to compute input kwargs for each # method here. - impl = params["impl"] - impl_version = version_infos[impl] + impl_info = json_impls[params["impl"]] + method = impl_info['module'].dumps + impl_version = impl_info['version'] params["impl_version"] = impl_version - method = method_lut(impl) - data = data_lut(params["input"], params["size"]) + data = data_lut[params["input"]](params["size"]) # Timerit will run some user-specified number of loops. # and compute time stats with similar methodology to timeit for timer in benchmark.measure(): @@ -114,20 +198,25 @@ def benchmark_json_dumps(): benchmark.dump_in_dpath(dpath) RECORD_ALL = 0 - metric_key = "time" if RECORD_ALL else "mean" + metric_key = "time" if RECORD_ALL else "mean_time" from benchmarker import result_analysis results = benchmark.result.to_result_list() + analysis = result_analysis.ResultAnalysis( results, metrics=[metric_key], params=['impl'], metric_objectives={ - 'min': 'min', - 'mean': 'min', + 'min_time': 'min', + 'mean_time': 'min', 'time': 'min', }) analysis.analysis() + analysis.table + + param_group = ['impl', 'impl_version'] + analysis.abalate(param_group) # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) diff --git a/tests/benchmarker/_test_ttest.py b/tests/benchmarker/_test_ttest.py deleted file mode 100644 index 4e83a5d..0000000 --- a/tests/benchmarker/_test_ttest.py +++ /dev/null @@ -1,28 +0,0 @@ - -def check_ttest(): - import scipy - import scipy.stats # NOQA - from benchmarker.benchmarker import stats_dict - import numpy as np - metric_vals1 = np.random.randn(10000) + 0.01 - metric_vals2 = np.random.randn(1000) - - stats1 = stats_dict(metric_vals1) - stats2 = stats_dict(metric_vals2) - - ind_kw = dict( - equal_var=0, - # alternative='two-sided' - alternative='less' if stats1['mean'] < stats2['mean'] else 'greater' - ) - - # Not sure why these are slightly different - res1 = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) - - res2 = scipy.stats.ttest_ind_from_stats( - stats1['mean'], stats1['std'], stats1['n'], - stats2['mean'], stats2['std'], stats2['n'], - **ind_kw - ) - print('res1 = {!r}'.format(res1)) - print('res2 = {!r}'.format(res2)) diff --git a/tests/benchmarker/benchmarker.py b/tests/benchmarker/benchmarker.py index b488fb3..1050d6a 100644 --- a/tests/benchmarker/benchmarker.py +++ b/tests/benchmarker/benchmarker.py @@ -8,9 +8,9 @@ from benchmarker.process_context import ProcessContext @dataclass class BenchmarkerConfig: - name : str = None - num : int = 100 - bestof : int = 10 + name : str = None + num : int = 100 + bestof : int = 10 class BenchmarkerResult: @@ -97,14 +97,16 @@ class Benchmarker: >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir() >>> 
self.dump_in_dpath(dpath) """ - def __init__(self, basis={}, **kwargs): + def __init__(self, basis={}, verbose=1, **kwargs): self.basis = basis self.config = BenchmarkerConfig(**kwargs) self.ti = timerit.Timerit( num=self.config.num, - bestof=self.config.bestof) + bestof=self.config.bestof, + verbose=verbose, + ) self.context = ProcessContext(name=self.config.name) self.rows = [] self.RECORD_ALL = 0 @@ -152,7 +154,7 @@ class Benchmarker: rows.append(row) else: times = np.array(ti.robust_times()) - metrics = stats_dict(times) + metrics = stats_dict(times, '_time') row = { 'metrics': metrics, 'params': params, @@ -161,13 +163,13 @@ class Benchmarker: rows.append(row) -def stats_dict(data): +def stats_dict(data, suffix=''): stats = { - 'n': len(data), - 'mean': data.mean(), - 'std': data.std(), - 'min': data.min(), - 'max': data.max(), + 'nobs' + suffix: len(data), + 'mean' + suffix: data.mean(), + 'std' + suffix: data.std(), + 'min' + suffix: data.min(), + 'max' + suffix: data.max(), } return stats @@ -182,12 +184,12 @@ def combine_stats(s1, s2): Example: >>> basis = { - >>> 'n1': [1, 10, 100, 10000], - >>> 'n2': [1, 10, 100, 10000], + >>> 'nobs1': [1, 10, 100, 10000], + >>> 'nobs2': [1, 10, 100, 10000], >>> } >>> for params in ub.named_product(basis): - >>> data1 = np.random.rand(params['n1']) - >>> data2 = np.random.rand(params['n2']) + >>> data1 = np.random.rand(params['nobs1']) + >>> data2 = np.random.rand(params['nobs2']) >>> data3 = np.hstack([data1, data2]) >>> s1 = stats_dict(data1) >>> s2 = stats_dict(data2) @@ -203,7 +205,7 @@ def combine_stats(s1, s2): https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups """ stats = [s1, s2] - sizes = np.array([s['n'] for s in stats]) + sizes = np.array([s['nobs'] for s in stats]) means = np.array([s['mean'] for s in stats]) stds = np.array([s['std'] for s in stats]) mins = np.array([s['min'] for s in stats]) @@ -221,7 +223,7 @@ def combine_stats(s1, s2): combo_std = np.sqrt(combo_vars) combo_stats = { - 'n': combo_size, + 'nobs': combo_size, 'mean': combo_mean, 'std': combo_std, 'min': mins.min(), diff --git a/tests/benchmarker/result_analysis.py b/tests/benchmarker/result_analysis.py index 1067b3e..fd56edb 100644 --- a/tests/benchmarker/result_analysis.py +++ b/tests/benchmarker/result_analysis.py @@ -8,6 +8,19 @@ import scipy import scipy.stats # NOQA +# a list of common objectives +DEFAULT_METRIC_TO_OBJECTIVE = { + 'time': 'min', + 'ap': 'max', + 'acc': 'max', + 'f1': 'max', + 'mcc': 'max', + # + 'loss': 'min', + 'brier': 'min', +} + + class Result(ub.NiceRepr): """ Storage of names, parameters, and quality metrics for a single experiment. 
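A hedged sketch tying the renames in this patch together: stats_dict with a suffix produces metric names like 'mean_time' and 'min_time', which is why the updated benchmark3.py passes metric_objectives such as {'mean_time': 'min'} and why 'time' sits in the default objective table above. Assumes the benchmarker package is importable.

import numpy as np
from benchmarker.benchmarker import stats_dict

times = np.array([0.021, 0.019, 0.020, 0.018])
metrics = stats_dict(times, '_time')
print(sorted(metrics))  # ['max_time', 'mean_time', 'min_time', 'nobs_time', 'std_time']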
@@ -31,6 +44,10 @@ class Result(ub.NiceRepr): >>> self = Result.demo(rng=32) >>> print('self = {}'.format(self)) self = + + Example: + >>> self = Result.demo(mode='alt', rng=32) + >>> print('self = {}'.format(self)) """ def __init__(self, name, params, metrics, meta=None): self.name = name @@ -48,21 +65,43 @@ class Result(ub.NiceRepr): return text @classmethod - def demo(cls, rng=None): + def demo(cls, mode='null', rng=None): import numpy as np import string import kwarray rng = kwarray.ensure_rng(rng) - demo_param_space = { - 'param1': list(range(3)), - 'param2': np.linspace(0, 10, 10), - 'param3': list(string.ascii_lowercase[0:3]), - } - params = {k: rng.choice(b) for k, b in demo_param_space.items()} - metrics = { - 'f1': rng.rand(), - 'acc': rng.rand(), - } + + if mode == 'null': + # The null hypothesis should generally be true here, + # there is no relation between the results and parameters + demo_param_space = { + 'param1': list(range(3)), + 'param2': np.linspace(0, 10, 10), + 'param3': list(string.ascii_lowercase[0:3]), + } + params = {k: rng.choice(b) for k, b in demo_param_space.items()} + metrics = { + 'f1': rng.rand(), + 'acc': rng.rand(), + } + elif mode == 'alt': + # The alternative hypothesis should be true here, there is a + # relationship between results two of the params. + from scipy.special import expit + params = { + 'w': rng.randint(-1, 1), + 'x': rng.randint(-3, 3), + 'y': rng.randint(-2, 2), + 'z': rng.randint(-3, 3), + } + noise = np.random.randn() * 1 + r = 3 * params['x'] + params['y'] ** 2 + 0.3 * params['z'] ** 3 + acc = expit(r / 20 + noise) + metrics = { + 'acc': acc, + } + else: + raise KeyError(mode) name = ub.hash_data(params)[0:8] self = cls(name, params, metrics) return self @@ -105,6 +144,10 @@ class ResultAnalysis(ub.NiceRepr): >>> self = ResultAnalysis.demo() >>> self.analysis() + Example: + >>> self = ResultAnalysis.demo(num=5000, mode='alt') + >>> self.analysis() + Example: >>> # Given a list of experiments, configs, and results >>> # Create a ResultAnalysis object @@ -168,7 +211,8 @@ class ResultAnalysis(ub.NiceRepr): def __init__(self, results, metrics=None, params=None, ignore_params=None, ignore_metrics=None, metric_objectives=None, - abalation_orders={1}, default_objective='max'): + abalation_orders={1}, default_objective='max', + p_threshold=0.05): self.results = results if ignore_metrics is None: ignore_metrics = set() @@ -181,23 +225,15 @@ class ResultAnalysis(ub.NiceRepr): self.default_objective = default_objective # encode if we want to maximize or minimize a metric - default_metric_to_objective = { - 'ap': 'max', - 'acc': 'max', - 'f1': 'max', - # - 'loss': 'min', - 'brier': 'min', - } if metric_objectives is None: metric_objectives = {} - - self.metric_objectives = default_metric_to_objective.copy() + self.metric_objectives = DEFAULT_METRIC_TO_OBJECTIVE.copy() self.metric_objectives.update(metric_objectives) self.params = params self.metrics = metrics self.statistics = None + self.p_threshold = p_threshold self._description = {} self._description['built'] = False @@ -210,11 +246,14 @@ class ResultAnalysis(ub.NiceRepr): return ub.repr2(self._description, si=1, sv=1) @classmethod - def demo(cls, num=10, rng=None): + def demo(cls, num=10, mode='null', rng=None): import kwarray rng = kwarray.ensure_rng(rng) - results = [Result.demo(rng=rng) for _ in range(num)] - self = cls(results, metrics={'f1', 'acc'}) + results = [Result.demo(mode=mode, rng=rng) for _ in range(num)] + if mode == 'null': + self = cls(results, metrics={'f1', 'acc'}) + 
else: + self = cls(results, metrics={'acc'}) return self def run(self): @@ -251,18 +290,30 @@ class ResultAnalysis(ub.NiceRepr): varied = {k: vs for k, vs in varied.items() if len(vs)} return varied - def abalation_groups(self, param): + def abalation_groups(self, param_group, k=2): """ Return groups where the specified parameter(s) are varied, but all other non-ignored parameters are held the same. + Args: + param_group (str | List[str]): + One or more parameters that are allowed to vary + + k (int): + minimum number of items a group must contain to be returned + + Returns: + List[DataFrame]: + a list of subsets of in the table where all but the specified + (non-ignored) parameters are allowed to vary. + Example: >>> self = ResultAnalysis.demo() >>> param = 'param2' >>> self.abalation_groups(param) """ - if not ub.iterable(param): - param = [param] + if not ub.iterable(param_group): + param_group = [param_group] table = self.table config_rows = [r.params for r in self.results] config_keys = list(map(set, config_rows)) @@ -271,14 +322,14 @@ class ResultAnalysis(ub.NiceRepr): if self.ignore_params: config_keys = [c - self.ignore_params for c in config_keys] isect_params = set.intersection(*config_keys) - other_params = sorted(isect_params - set(param)) + other_params = sorted(isect_params - set(param_group)) groups = [] for key, group in table.groupby(other_params, dropna=False): - if len(group) > 1: + if len(group) >= k: groups.append(group) return groups - def abalate(self, param): + def abalate(self, param_group): """ Example: >>> self = ResultAnalysis.demo(100) @@ -287,34 +338,34 @@ class ResultAnalysis(ub.NiceRepr): >>> self.abalate(param) >>> self = ResultAnalysis.demo() - >>> param = ['param2', 'param3'] - >>> self.abalate(param) + >>> param_group = ['param2', 'param3'] + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self.abalate(param_group) """ - import itertools as it if self.table is None: self.table = self.build_table() - if not ub.iterable(param): - param = [param] + if not ub.iterable(param_group): + param_group = [param_group] # For hashable generic dictionary from collections import namedtuple - gd = namedtuple('config', param) + gd = namedtuple('config', param_group) # from types import SimpleNamespace - param_unique_vals_ = self.table[param].drop_duplicates().to_dict('records') + param_unique_vals_ = self.table[param_group].drop_duplicates().to_dict('records') param_unique_vals = [gd(**d) for d in param_unique_vals_] - # param_unique_vals = {p: self.table[p].unique().tolist() for p in param} + # param_unique_vals = {p: self.table[p].unique().tolist() for p in param_group} score_improvements = ub.ddict(list) scored_obs = [] skillboard = SkillTracker(param_unique_vals) - groups = self.abalation_groups(param) + groups = self.abalation_groups(param_group, k=2) for group in groups: for metric_key in self.metrics: ascending = self._objective_is_ascending(metric_key) group = group.sort_values(metric_key, ascending=ascending) - subgroups = group.groupby(param) + subgroups = group.groupby(param_group) if ascending: best_idx = subgroups[metric_key].idxmax() else: @@ -326,19 +377,19 @@ class ResultAnalysis(ub.NiceRepr): if x1 != x2: r1 = best_group.loc[x1] r2 = best_group.loc[x2] - k1 = gd(**r1[param]) - k2 = gd(**r2[param]) + k1 = gd(**r1[param_group]) + k2 = gd(**r2[param_group]) diff = r1[metric_key] - r2[metric_key] score_improvements[(k1, k2, metric_key)].append(diff) # metric_vals = best_group[metric_key].values # diffs = metric_vals[None, :] - metric_vals[:, None] - 
best_group.set_index(param) - # best_group[param] + best_group.set_index(param_group) + # best_group[param_group] # best_group[metric_key].diff() - scored_ranking = best_group[param + [metric_key]].reset_index(drop=True) + scored_ranking = best_group[param_group + [metric_key]].reset_index(drop=True) scored_obs.append(scored_ranking) - ranking = [gd(**d) for d in scored_ranking[param].to_dict('records')] + ranking = [gd(**d) for d in scored_ranking[param_group].to_dict('records')] skillboard.observe(ranking) print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) @@ -377,15 +428,12 @@ class ResultAnalysis(ub.NiceRepr): # TODO : document these stats clearly and accurately Example: - >>> self = ResultAnalysis.demo(num=30) + >>> self = ResultAnalysis.demo(num=100) >>> print(self.table) - >>> param_group = ['param2'] + >>> param_group = ['param2', 'param1'] >>> metric_key = 'f1' >>> stats_row = self.test_group(param_group, metric_key) - >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) - >>> # --- - >>> self.build() - >>> self.report() + >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, sort=0, precision=2))) """ param_group_name = ','.join(param_group) stats_row = { @@ -461,10 +509,6 @@ class ResultAnalysis(ub.NiceRepr): pairwise_statistics = [] for pair in value_pairs: pair_statistics = {} - # try: - # param_val1, param_val2 = sorted(pair) - # except Exception: - # param_val1, param_val2 = (pair) param_val1, param_val2 = pair metric_vals1 = value_to_metric[param_val1] @@ -477,16 +521,17 @@ class ResultAnalysis(ub.NiceRepr): pair_statistics['value2'] = param_val2 pair_statistics['n1'] = len(metric_vals1) pair_statistics['n2'] = len(metric_vals2) - # TODO: probably want to use an alternative=less or greater here - # instead of simply unequal - alternative = 'two-sided' - if 1: + + TEST_ONLY_FOR_DIFFERENCE = True + if TEST_ONLY_FOR_DIFFERENCE: if ascending: # We want to minimize the metric alternative = 'less' if rank1 < rank2 else 'greater' else: # We want to maximize the metric alternative = 'greater' if rank1 < rank2 else 'less' + else: + alternative = 'two-sided' ind_kw = dict( equal_var=False, @@ -499,8 +544,8 @@ class ResultAnalysis(ub.NiceRepr): stats1 = stats_dict(metric_vals1) stats2 = stats_dict(metric_vals2) scipy.stats.ttest_ind_from_stats( - stats1['mean'], stats1['std'], stats1['n'], - stats2['mean'], stats2['std'], stats2['n'], + stats1['mean'], stats1['std'], stats1['nobs'], + stats2['mean'], stats2['std'], stats2['nobs'], **ind_kw ) # metric_vals1, metric_vals2, equal_var=False) @@ -523,6 +568,8 @@ class ResultAnalysis(ub.NiceRepr): for nk in common: group1 = nk_to_group1[nk] group2 = nk_to_group2[nk] + # TODO: Not sure if taking the product of everything within + # the comparable group is correct or not. I think it is ok. 
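+                # For intuition (illustrative values): it.product enumerates
+                # every candidate pairing, e.g.
+                #   >>> import itertools as it
+                #   >>> list(it.product([0, 1], [5, 6]))
+                #   [(0, 5), (0, 6), (1, 5), (1, 6)]
+                # ttest_rel then treats each (i, j) pair as one paired
+                # observation, which is only meaningful if every such pair is
+                # a repeated measurement of the same configuration.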
for i, j in it.product(group1.index, group2.index): comparable_indexes1.append(i) comparable_indexes2.append(j) @@ -590,7 +637,6 @@ class ResultAnalysis(ub.NiceRepr): self._description['built'] = True def report(self): - p_threshold = 0.05 stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) stat_groups_items = list(stat_groups.items()) @@ -600,43 +646,47 @@ class ResultAnalysis(ub.NiceRepr): 'metrics': self.metrics_of_interest, }) for grid_item in grid: - metric_key = grid_item['metrics'] - stat_groups_item = grid_item['stat_group_item'] - - param_name, stat_group = stat_groups_item - stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] - title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) - print('\n\n') - print(title) - print('=' * len(title)) - print(stats_row['moments']) - anova_rank_p = stats_row['anova_rank_p'] - anova_mean_p = stats_row['anova_mean_p'] - # Rougly speaking - print('') - print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) - print('') - print('Pairwise T-Tests') - for pairstat in stats_row['pairwise']: - # Is this backwards? - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] - if value2 == winner: - value1, value2 = value2, value1 - print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') - if 'ttest_ind' in pairstat: - ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) - if 'ttest_rel' in pairstat: - n_common = pairstat['n_common'] - ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + self._report_one(grid_item) print(self.stats_table) + def _report_one(self, grid_item): + p_threshold = self.p_threshold + metric_key = grid_item['metrics'] + stat_groups_item = grid_item['stat_group_item'] + + param_name, stat_group = stat_groups_item + stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] + title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) + print('\n\n') + print(title) + print('=' * len(title)) + print(stats_row['moments']) + anova_rank_p = stats_row['anova_rank_p'] + anova_mean_p = stats_row['anova_mean_p'] + # Rougly speaking + print('') + print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print('') + print('Pairwise T-Tests') + for pairstat in stats_row['pairwise']: + # Is this backwards? 
+ value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') + if 'ttest_ind' in pairstat: + ttest_ind_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + if 'ttest_rel' in pairstat: + n_common = pairstat['n_common'] + ttest_rel_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + def conclusions(self): conclusions = [] for stat in self.statistics: @@ -653,6 +703,50 @@ class ResultAnalysis(ub.NiceRepr): conclusions.append(txt) return conclusions + def plot(self, xlabel, metric_key, group_labels): + """ + Example: + >>> self = ResultAnalysis.demo(num=5000, mode='alt') + >>> self.analysis() + >>> print('self = {}'.format(self)) + >>> # xdoctest: +REQUIRES(module:kwplot) + >>> import kwplot + >>> kwplot.autompl() + >>> xlabel = 'x' + >>> metric_key = 'acc' + >>> group_labels = { + >>> 'col': ['y', 'w'], + >>> 'hue': ['z'], + >>> 'size': [], + >>> } + >>> self.plot(xlabel, metric_key, group_labels) + """ + import seaborn as sns + sns.set() + from matplotlib import pyplot as plt # NOQA + data = self.table + data = data.sort_values(metric_key) + for gname, labels in group_labels.items(): + if len(labels): + new_col = [] + for row in data[labels].to_dict('records'): + item = ub.repr2(row, compact=1, si=1) + new_col.append(item) + gkey = gname + "_key" + data[gkey] = new_col + + plotkw = {} + for gname, labels in group_labels.items(): + if labels: + plotkw[gname] = gname + "_key" + + # Your variables may change + # ax = plt.figure().gca() + col = plotkw.pop("col") + facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) + facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) + facet.add_legend() + class SkillTracker: """ @@ -677,6 +771,9 @@ class SkillTracker: 4: 0.20, 5: 0.20, } + + Requirements: + openskill """ def __init__(self, player_ids): From daf8913cc248ce5aae6e0292489230d5613adb7b Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 10:09:33 -0400 Subject: [PATCH 04/25] log scale --- tests/benchmark3.py | 19 +++++++++++++++++-- tests/benchmarker/result_analysis.py | 15 ++++++++++----- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/tests/benchmark3.py b/tests/benchmark3.py index b6e084e..2563d4d 100644 --- a/tests/benchmark3.py +++ b/tests/benchmark3.py @@ -217,13 +217,28 @@ def benchmark_json_dumps(): param_group = ['impl', 'impl_version'] analysis.abalate(param_group) - # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "col": ["input"], + "hue": ["impl"], + "size": [], + } + import kwplot + kwplot.autompl() + facet = analysis.plot(xlabel, metric_key, group_labels) + for ax in facet.axes.ravel(): + ax.set_xscale('log') + ax.set_yscale('log') + print('facet = {!r}'.format(facet)) + kwplot.show_if_requested() + if __name__ == "__main__": """ CommandLine: - python ~/code/ultrajson/tests/benchmark3.py + python ~/code/ultrajson/tests/benchmark3.py --show """ benchmark_json_dumps() diff --git a/tests/benchmarker/result_analysis.py b/tests/benchmarker/result_analysis.py index fd56edb..d6f474e 100644 --- 
a/tests/benchmarker/result_analysis.py +++ b/tests/benchmarker/result_analysis.py @@ -240,9 +240,6 @@ class ResultAnalysis(ub.NiceRepr): self._description['num_results'] = len(self.results) def __nice__(self): - # if len(self._description) == 0: - # return 'unbuilt' - # else: return ub.repr2(self._description, si=1, sv=1) @classmethod @@ -405,8 +402,13 @@ class ResultAnalysis(ub.NiceRepr): def _objective_is_ascending(self, metric_key): """ - Return True if we should minimize the objective (lower is better) - Return False if we should maximize the objective (higher is better) + Args: + metric_key (str): the metric in question + + Returns: + bool: + True if we should minimize the objective (lower is better) + False if we should maximize the objective (higher is better) """ objective = self.metric_objectives.get(metric_key, None) if objective is None: @@ -578,6 +580,8 @@ class ResultAnalysis(ub.NiceRepr): comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] # Does this need to have the values aligned? + # I think that is the case giving my understanding of paired + # t-tests, but the docs need a PR to make that more clear. ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) pair_statistics['n_common'] = len(common) pair_statistics['ttest_rel'] = ttest_rel_result @@ -746,6 +750,7 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) facet.add_legend() + return facet class SkillTracker: From 68c4a55284318969a17ada11e6935935f007fe8f Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 16:03:23 -0400 Subject: [PATCH 05/25] Reorganize as separate module --- json_benchmarks/__init__.py | 0 json_benchmarks/__main__.py | 8 + json_benchmarks/benchmarker/__init__.py | 39 +++ .../benchmarker/aggregate.py | 6 +- .../benchmarker/benchmarker.py | 28 +- .../benchmarker/process_context.py | 17 +- .../benchmarker/result_analysis.py | 176 +++++++++---- .../benchmarker/util_json.py | 0 .../benchmarker/visualize.py | 0 json_benchmarks/core.py | 248 ++++++++++++++++++ json_benchmarks/datagen.py | 115 ++++++++ tests/benchmark3.py | 244 ----------------- tests/benchmarker/__init__.py | 35 --- 13 files changed, 568 insertions(+), 348 deletions(-) create mode 100644 json_benchmarks/__init__.py create mode 100644 json_benchmarks/__main__.py create mode 100644 json_benchmarks/benchmarker/__init__.py rename {tests => json_benchmarks}/benchmarker/aggregate.py (89%) rename {tests => json_benchmarks}/benchmarker/benchmarker.py (91%) rename {tests => json_benchmarks}/benchmarker/process_context.py (87%) rename {tests => json_benchmarks}/benchmarker/result_analysis.py (87%) rename {tests => json_benchmarks}/benchmarker/util_json.py (100%) rename {tests => json_benchmarks}/benchmarker/visualize.py (100%) create mode 100644 json_benchmarks/core.py create mode 100644 json_benchmarks/datagen.py delete mode 100644 tests/benchmark3.py delete mode 100644 tests/benchmarker/__init__.py diff --git a/json_benchmarks/__init__.py b/json_benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/json_benchmarks/__main__.py b/json_benchmarks/__main__.py new file mode 100644 index 0000000..faf03f6 --- /dev/null +++ b/json_benchmarks/__main__.py @@ -0,0 +1,8 @@ + +if __name__ == '__main__': + """ + CommandLine: + python -m json_benchmarks + """ + from json_benchmarks import core + core.main() diff --git 
a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py new file mode 100644 index 0000000..8abb4c5 --- /dev/null +++ b/json_benchmarks/benchmarker/__init__.py @@ -0,0 +1,39 @@ +""" +A helper module for executing, serializing, combining, and comparing benchmarks +""" + +__mkinit__ = """ +# Autogenerate this file +mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w +""" + +__version__ = '0.1.0' + +from json_benchmarks.benchmarker import aggregate +from json_benchmarks.benchmarker import benchmarker +from json_benchmarks.benchmarker import process_context +from json_benchmarks.benchmarker import result_analysis +from json_benchmarks.benchmarker import util_json +from json_benchmarks.benchmarker import visualize + +from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) +from json_benchmarks.benchmarker.benchmarker import (Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, + combine_stats, + stats_dict,) +from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker.result_analysis import ( + DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) +from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', + 'ResultAnalysis', 'SkillTracker', 'aggregate', 'benchmark_analysis', + 'benchmarker', 'combine_stats', 'demo', 'demo_data', + 'ensure_json_serializable', 'find_json_unserializable', + 'indexable_allclose', 'process_context', 'result_analysis', + 'stats_dict', 'util_json', 'visualize'] diff --git a/tests/benchmarker/aggregate.py b/json_benchmarks/benchmarker/aggregate.py similarity index 89% rename from tests/benchmarker/aggregate.py rename to json_benchmarks/benchmarker/aggregate.py index 41d11e8..b2d74c9 100644 --- a/tests/benchmarker/aggregate.py +++ b/json_benchmarks/benchmarker/aggregate.py @@ -4,7 +4,7 @@ import ubelt as ub def demo_data(): - from benchmarker.benchmarker import Benchmarker + from json_benchmarks.benchmarker.benchmarker import Benchmarker import numpy as np impl_lut = { 'numpy': np.sum, @@ -43,8 +43,8 @@ def demo_data(): def demo(): - from benchmarker import BenchmarkerResult - from benchmarker import result_analysis + from json_benchmarks.benchmarker import BenchmarkerResult + from json_benchmarks.benchmarker import result_analysis fpaths = demo_data() results = [] diff --git a/tests/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py similarity index 91% rename from tests/benchmarker/benchmarker.py rename to json_benchmarks/benchmarker/benchmarker.py index 1050d6a..6ff05d5 100644 --- a/tests/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -3,7 +3,7 @@ import timerit import ubelt as ub import numpy as np from dataclasses import dataclass -from benchmarker.process_context import ProcessContext +from json_benchmarks.benchmarker.process_context import ProcessContext @dataclass @@ -49,7 +49,7 @@ class BenchmarkerResult: Returns: List[Result] """ - from benchmarker import result_analysis + from json_benchmarks.benchmarker import result_analysis results = [] for row in self.rows: result = result_analysis.Result( @@ -69,9 +69,6 @@ class Benchmarker: Helper to organize the execution and serialization of a benchmark Example: - >>> import 
sys, ubelt - >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) - >>> from benchmarker.benchmarker import * # NOQA >>> import numpy as np >>> impl_lut = { >>> 'numpy': np.sum, @@ -205,11 +202,22 @@ def combine_stats(s1, s2): https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups """ stats = [s1, s2] - sizes = np.array([s['nobs'] for s in stats]) - means = np.array([s['mean'] for s in stats]) - stds = np.array([s['std'] for s in stats]) - mins = np.array([s['min'] for s in stats]) - maxs = np.array([s['max'] for s in stats]) + data = { + 'nobs': np.array([s['nobs'] for s in stats]), + 'mean': np.array([s['mean'] for s in stats]), + 'std': np.array([s['std'] for s in stats]), + 'min': np.array([s['min'] for s in stats]), + 'max': np.array([s['max'] for s in stats]), + } + combine_stats_arrs(data) + + +def combine_stats_arrs(data): + sizes = data['nobs'] + means = data['mean'] + stds = data['std'] + mins = data['min'] + maxs = data['max'] varis = stds * stds combo_size = sizes.sum() diff --git a/tests/benchmarker/process_context.py b/json_benchmarks/benchmarker/process_context.py similarity index 87% rename from tests/benchmarker/process_context.py rename to json_benchmarks/benchmarker/process_context.py index e198f9c..bce02c0 100644 --- a/tests/benchmarker/process_context.py +++ b/json_benchmarks/benchmarker/process_context.py @@ -9,11 +9,10 @@ class ProcessContext: Context manager to track the context under which a result was computed Example: - >>> import sys, ubelt - >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) - >>> from benchmarker.process_context import * # NOQA + >>> from json_benchmarks.benchmarker.process_context import * # NOQA >>> self = ProcessContext() >>> obj = self.start().stop() + >>> print('obj = {}'.format(ub.repr2(obj, nl=2))) """ def __init__(self, name=None, args=None, config=None): @@ -69,15 +68,19 @@ class ProcessContext: 'mem_total': svmem_info.total, } - # def _cpuinfo(self): - # import cpuinfo - # cpu_info = cpuinfo.get_cpu_info() - # return cpu_info + def _cpuinfo(self): + import cpuinfo + _cpu_info = cpuinfo.get_cpu_info() + cpu_info = { + 'cpu_brand': _cpu_info['brand_raw'], + } + return cpu_info def _machine(self): return ub.dict_union( self._hostinfo(), self._meminfo(), + self._cpuinfo(), self._osinfo(), self._pyinfo(), ) diff --git a/tests/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py similarity index 87% rename from tests/benchmarker/result_analysis.py rename to json_benchmarks/benchmarker/result_analysis.py index d6f474e..e07d027 100644 --- a/tests/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -89,10 +89,11 @@ class Result(ub.NiceRepr): # relationship between results two of the params. 
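                # For intuition (worked numbers, ignoring the noise term):
                # x=3, y=2, z=3 gives r = 9 + 4 + 8.1 = 21.1 and
                # expit(21.1 / 20) is roughly 0.74, while x=-2, y=0, z=0 gives
                # r = -6 and expit(-0.3) is roughly 0.43, so 'x', 'y' and 'z'
                # clearly move 'acc'; 'u' and 'v' never enter r, so they
                # presumably act as null controls.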
from scipy.special import expit params = { - 'w': rng.randint(-1, 1), - 'x': rng.randint(-3, 3), - 'y': rng.randint(-2, 2), - 'z': rng.randint(-3, 3), + 'u': rng.randint(0, 1 + 1), + 'v': rng.randint(-1, 1 + 1), + 'x': rng.randint(-2, 3 + 1), + 'y': rng.randint(-1, 2 + 1), + 'z': rng.randint(-0, 3 + 1), } noise = np.random.randn() * 1 r = 3 * params['x'] + params['y'] ** 2 + 0.3 * params['z'] ** 3 @@ -326,8 +327,28 @@ class ResultAnalysis(ub.NiceRepr): groups.append(group) return groups + def _objective_is_ascending(self, metric_key): + """ + Args: + metric_key (str): the metric in question + + Returns: + bool: + True if we should minimize the objective (lower is better) + False if we should maximize the objective (higher is better) + """ + objective = self.metric_objectives.get(metric_key, None) + if objective is None: + warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + objective = self.default_objective + ascending = (objective == 'min') + return ascending + def abalate(self, param_group): """ + TODO: + rectify with test-group + Example: >>> self = ResultAnalysis.demo(100) >>> param = 'param2' @@ -400,23 +421,6 @@ class ResultAnalysis(ub.NiceRepr): print(pd.DataFrame([pd.Series(pos_delta).describe().T])) return scored_obs - def _objective_is_ascending(self, metric_key): - """ - Args: - metric_key (str): the metric in question - - Returns: - bool: - True if we should minimize the objective (lower is better) - False if we should maximize the objective (higher is better) - """ - objective = self.metric_objectives.get(metric_key, None) - if objective is None: - warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') - objective = self.default_objective - ascending = (objective == 'min') - return ascending - def test_group(self, param_group, metric_key): """ Get stats for a particular metric / constant group @@ -477,8 +481,10 @@ class ResultAnalysis(ub.NiceRepr): # Determine a set of value pairs to do pairwise comparisons on value_pairs = ub.oset() - value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) - value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) + # value_pairs.update( + # map(frozenset, ub.iter_window(moments.index, 2))) + value_pairs.update(map(frozenset, ub.iter_window( + moments.sort_values('mean', ascending=ascending).index, 2))) # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance # If the researcher can make the assumptions of an identically @@ -508,9 +514,9 @@ class ResultAnalysis(ub.NiceRepr): stats_row['anova_mean_p'] = anova_1way_result.pvalue stats_row['moments'] = moments - pairwise_statistics = [] + pair_stats_list = [] for pair in value_pairs: - pair_statistics = {} + pair_stats = {} param_val1, param_val2 = pair metric_vals1 = value_to_metric[param_val1] @@ -518,11 +524,11 @@ class ResultAnalysis(ub.NiceRepr): rank1 = param_to_rank[param_val1] rank2 = param_to_rank[param_val2] - pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 - pair_statistics['value1'] = param_val1 - pair_statistics['value2'] = param_val2 - pair_statistics['n1'] = len(metric_vals1) - pair_statistics['n2'] = len(metric_vals2) + pair_stats['winner'] = param_val1 if rank1 < rank2 else param_val2 + pair_stats['value1'] = param_val1 + pair_stats['value2'] = param_val2 + pair_stats['n1'] = len(metric_vals1) + pair_stats['n2'] = len(metric_vals2) TEST_ONLY_FOR_DIFFERENCE = True if TEST_ONLY_FOR_DIFFERENCE: @@ -554,7 +560,7 @@ class 
ResultAnalysis(ub.NiceRepr): scipy.stats.ttest_ind_from_stats - pair_statistics['ttest_ind'] = ttest_ind_result + pair_stats['ttest_ind'] = ttest_ind_result # Do relative checks, need to find comparable subgroups metric_group1 = value_to_metric_group[param_val1] @@ -583,11 +589,11 @@ class ResultAnalysis(ub.NiceRepr): # I think that is the case giving my understanding of paired # t-tests, but the docs need a PR to make that more clear. ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) - pair_statistics['n_common'] = len(common) - pair_statistics['ttest_rel'] = ttest_rel_result - pairwise_statistics.append(pair_statistics) + pair_stats['n_common'] = len(common) + pair_stats['ttest_rel'] = ttest_rel_result + pair_stats_list.append(pair_stats) - stats_row['pairwise'] = pairwise_statistics + stats_row['pairwise'] = pair_stats_list return stats_row def build(self): @@ -671,8 +677,10 @@ class ResultAnalysis(ub.NiceRepr): # Rougly speaking print('') print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', + 'green' if anova_rank_p < p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', + 'green' if anova_mean_p < p_threshold else None)) print('') print('Pairwise T-Tests') for pairstat in stats_row['pairwise']: @@ -685,11 +693,13 @@ class ResultAnalysis(ub.NiceRepr): print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') if 'ttest_ind' in pairstat: ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', + 'green' if ttest_ind_result.pvalue < p_threshold else None)) if 'ttest_rel' in pairstat: n_common = pairstat['n_common'] ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', + 'green' if ttest_rel_result.pvalue < p_threshold else None)) def conclusions(self): conclusions = [] @@ -709,17 +719,31 @@ class ResultAnalysis(ub.NiceRepr): def plot(self, xlabel, metric_key, group_labels): """ + Args: + group_labels (dict): + Tells seaborn what attributes to use to distinsuish curves like + hue, size, marker. Also can contain "col" for use with + FacetGrid, and "fig" to separate different configurations into + different figures. + + Returns: + List[Dict]: + A list for each figure containing info abou that figure for any + postprocessing. 
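+                (In the current sketch each entry holds a 'fig' and a 'facet'
+                key; treat these keys as implementation details rather than a
+                stable contract.)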
+ Example: - >>> self = ResultAnalysis.demo(num=5000, mode='alt') + >>> self = ResultAnalysis.demo(num=1000, mode='alt') >>> self.analysis() >>> print('self = {}'.format(self)) + >>> print('self.varied = {}'.format(ub.repr2(self.varied, nl=1))) >>> # xdoctest: +REQUIRES(module:kwplot) >>> import kwplot - >>> kwplot.autompl() + >>> kwplot.autosns() >>> xlabel = 'x' >>> metric_key = 'acc' >>> group_labels = { - >>> 'col': ['y', 'w'], + >>> 'fig': ['u'], + >>> 'col': ['y', 'v'], >>> 'hue': ['z'], >>> 'size': [], >>> } @@ -739,18 +763,72 @@ class ResultAnalysis(ub.NiceRepr): gkey = gname + "_key" data[gkey] = new_col - plotkw = {} + plot_kws = { + 'x': xlabel, + 'y': metric_key, + } for gname, labels in group_labels.items(): if labels: - plotkw[gname] = gname + "_key" + plot_kws[gname] = gname + "_key" # Your variables may change # ax = plt.figure().gca() - col = plotkw.pop("col") - facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) - facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) - facet.add_legend() - return facet + fig_params = plot_kws.pop("fig", []) + + facet_kws = { + 'sharex': False, + 'sharey': False, + } + # facet_kws['col'] = plot_kws.pop("col", None) + # facet_kws['row'] = plot_kws.pop("row", None) + # if not facet_kws['row']: + # facet_kws['col_wrap'] = 5 + plot_kws['row'] = plot_kws.get("row", None) + # if not plot_kws['row']: + # plot_kws['col_wrap'] = 5 + + if not fig_params: + groups = [('', data)] + else: + groups = data.groupby(fig_params) + + if 'marker' not in plot_kws: + plot_kws['marker'] = "o" + + plot_kws['ci'] = "sd" + + # Use a consistent pallete across plots + unique_hues = data['hue_key'].unique() + palette = ub.dzip(unique_hues, sns.color_palette(n_colors=len(unique_hues))) + plot_kws['palette'] = palette + + plots = [] + base_fnum = 1 + for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): + # TODO: seaborn doesn't give us any option to reuse an existing + # figure or even specify what it's handle should be. A patch should + # be submitted to add that feature, but in the meantime work around + # it and use the figures they give us. 
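+            # A hypothetical workaround (sketch only, not used here): drop
+            # down to the axes-level API, which does accept an existing Axes,
+            # e.g.
+            #   fig = plt.figure(fnum); ax = fig.gca()
+            #   sns.lineplot(data=group, x=xlabel, y=metric_key, ax=ax)
+            # at the cost of losing relplot's automatic faceting.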
+ + # fig = plt.figure(fnum) + # fig.clf() + + facet = sns.relplot( + data=group, kind='line', + facet_kws=facet_kws, + **plot_kws) + + fig = facet.figure + fig.suptitle(fig_key) + fig.tight_layout() + # facet = sns.FacetGrid(group, **facet_kws) + # facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, **plot_kws) + # facet.add_legend() + plots.append({ + 'fig': fig, + 'facet': facet, + }) + return plots class SkillTracker: diff --git a/tests/benchmarker/util_json.py b/json_benchmarks/benchmarker/util_json.py similarity index 100% rename from tests/benchmarker/util_json.py rename to json_benchmarks/benchmarker/util_json.py diff --git a/tests/benchmarker/visualize.py b/json_benchmarks/benchmarker/visualize.py similarity index 100% rename from tests/benchmarker/visualize.py rename to json_benchmarks/benchmarker/visualize.py diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py new file mode 100644 index 0000000..404a9f8 --- /dev/null +++ b/json_benchmarks/core.py @@ -0,0 +1,248 @@ +""" +Main definition of the benchmarks +""" +import json +import ubelt as ub +import scriptconfig as scfg +from json_benchmarks import benchmarker +from json_benchmarks import datagen + +KNOWN_LIBRARIES = [ + "ujson", + "nujson", + "orjson", + "simplejson", + "json", +] + + +class JSONBenchmarkConfig(scfg.Config): + """ + Benchmark JSON implementations + """ + default = { + 'disable': scfg.Value([], choices=KNOWN_LIBRARIES, help=ub.paragraph( + ''' + Remove specified libraries from the benchmarks + ''' + )), + + 'factor': scfg.Value(1.0, help=ub.paragraph( + ''' + Specify as a fraction to speed up benchmarks for development / + testing + ''')), + + 'cache_dir': scfg.Value(None, help=ub.paragraph( + ''' + Location for benchmark cache. + Defaults to $XDG_CACHE/ujson/benchmark_results/ + ''')), + } + + def normalize(self): + dpath = self['cache_dir'] + if dpath is None: + dpath = ub.Path.appdir('ujson/benchmark_results') + dpath = ub.Path(dpath) + self['cache_dir'] = dpath + + +def available_json_impls(): + import importlib + known_modnames = [ + 'ujson', 'json', 'nujson', 'orjson', 'simplejson' + ] + json_impls = {} + for libname in known_modnames: + try: + module = importlib.import_module(libname) + except ImportError: + pass + else: + json_impls[libname] = { + 'module': module, + 'version': module.__version__, + } + return json_impls + + +def benchmark_json(): + json_impls = available_json_impls() + + data_lut = datagen.json_test_data_generators() + + # These are the parameters that we benchmark over + basis = { + "input": [ + 'Array with doubles', + 'Array with UTF-8 strings', + # 'Medium complex object', + 'Array with True values', + 'Array of Dict[str, int]', + # 'Dict of List[Dict[str, int]]', + # 'Complex object' + ], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], + "impl": list(json_impls.keys()), + "func": ['dumps', 'loads'], + } + + # The Benchmarker class is a new experimental API around timerit to + # abstract away the details of timing a process over a grid of parameters, + # serializing the results, and aggregating results from disparate runs. + benchmark = benchmarker.Benchmarker( + name='bench_json', + num=100, + bestof=10, + verbose=2, + basis=basis, + ) + + # For each variation of your experiment, create a row. + for params in benchmark.iter_params(): + # Make any modifications you need to compute input kwargs for each + # method here. 
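+        # Each `params` dict is a single grid point drawn from `basis`, e.g.
+        # (illustrative values): {'input': 'Array with doubles', 'size': 256,
+        # 'impl': 'ujson', 'func': 'dumps'}, plus the keys added below.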
+ impl_info = json_impls[params["impl"]] + params["impl_version"] = impl_info['version'] + module = impl_info['module'] + if params['func'] == 'dumps': + method = module.dumps + data = data_lut[params["input"]](params["size"]) + elif params['func'] == 'loads': + method = module.loads + to_encode = data_lut[params["input"]](params["size"]) + data = json.dumps(to_encode) + # Timerit will run some user-specified number of loops. + # and compute time stats with similar methodology to timeit + for timer in benchmark.measure(): + # Put any setup logic you dont want to time here. + # ... + with timer: + # Put the logic you want to time here + method(data) + + dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() + result_fpath = benchmark.dump_in_dpath(dpath) + return result_fpath + + +def aggregate_results(result_fpaths): + import json + results = [] + for fpath in result_fpaths: + data = json.loads(fpath.read_text()) + for row in data['rows']: + result = benchmarker.BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "mean_time" + + # results = benchmark.result.to_result_list() + + analysis = benchmarker.result_analysis.ResultAnalysis( + results, + metrics=[metric_key], + params=['impl'], + metric_objectives={ + 'min_time': 'min', + 'mean_time': 'min', + 'time': 'min', + }) + analysis.analysis() + + table = analysis.table + + def aggregate_time_stats(data, group_keys=None): + """ + Given columns interpreted as containing stats, aggregate those stats + within each group. For each row, any non-group, non-stat column + with consistent values across that columns in the group is kept as-is, + otherwise the new column for that row is set to None. + """ + import pandas as pd + # Stats groupings + stats_cols = [ + 'nobs_time', + 'std_time', + 'mean_time', + 'max_time', + 'min_time', + ] + mapper = {c: c.replace('_time', '') for c in stats_cols} + unmapper = ub.invert_dict(mapper) + non_stats_cols = list(ub.oset(data.columns) - stats_cols) + if group_keys is None: + group_keys = non_stats_cols + non_group_keys = list(ub.oset(non_stats_cols) - group_keys) + from json_benchmarks.benchmarker.benchmarker import combine_stats_arrs + new_rows = [] + for group_vals, group in list(data.groupby(group_keys)): + # hack, is this a pandas bug in 1.4.1? 
Is it fixed + if isinstance(group_keys, list) and not isinstance(group_vals, list): + group_vals = [group_vals] + stat_data = group[stats_cols].rename(mapper, axis=1) + new_stats = combine_stats_arrs(stat_data) + new_time_stats = ub.map_keys(unmapper, new_stats) + new_row = ub.dzip(group_keys, group_vals) + if non_group_keys: + for k in non_group_keys: + unique_vals = group[k].unique() + if len(unique_vals) == 1: + new_row[k] = unique_vals[0] + else: + new_row[k] = None + new_row.update(new_time_stats) + new_rows.append(new_row) + new_data = pd.DataFrame(new_rows) + return new_data + + single_size = table[table['size'] == 256] + # single_size_combo = aggregate_time_stats(single_size, None) + single_size_combo = aggregate_time_stats(single_size, ['name']) + + param_group = ['impl', 'impl_version'] + single_size_combo['calls/sec'] = 1 / single_size_combo['mean_time'] + _single_size_combo = single_size_combo.copy() + _single_size_combo['calls/sec'] = _single_size_combo['calls/sec'].apply(lambda x: '{:,.02f}'.format(x)) + piv = _single_size_combo.pivot('input', param_group, 'calls/sec') + print(piv) + + analysis.abalate(param_group) + # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "fig": ["input"], + "hue": ["impl", "impl_version"], + "size": [], + } + import kwplot + kwplot.autosns() + plots = analysis.plot(xlabel, metric_key, group_labels) + for plot in plots: + for ax in plot['facet'].axes.ravel(): + ax.set_xscale('log') + ax.set_yscale('log') + kwplot.show_if_requested() + + +def main(): + from json_benchmarks import core + config = core.JSONBenchmarkConfig(cmdline=True) + dpath = config['cache_dir'] + + run = 1 + if run: + result_fpath = core.benchmark_json() + print('result_fpath = {!r}'.format(result_fpath)) + result_fpaths = [result_fpath] + + agg = 1 + if agg: + result_fpaths = list(dpath.glob('benchmarks*.json')) + + core.aggregate_results(result_fpaths) + # results_output_table(libraries) diff --git a/json_benchmarks/datagen.py b/json_benchmarks/datagen.py new file mode 100644 index 0000000..ff27d6c --- /dev/null +++ b/json_benchmarks/datagen.py @@ -0,0 +1,115 @@ +import random +import sys + + +def json_test_data_generators(): + """ + Generates data for benchmarks with various sizes + + Returns: + Dict[str, callable]: + a mapping from test data name to its generator + + Example: + >>> data_lut = json_test_data_generators() + >>> size = 2 + >>> keys = sorted(set(data_lut) - {'Complex object'}) + >>> for key in keys: + >>> func = data_lut[key] + >>> test_object = func(size) + >>> print('key = {!r}'.format(key)) + >>> print('test_object = {!r}'.format(test_object)) + """ + data_lut = {} + def _register_data(name): + def _wrap(func): + data_lut[name] = func + return _wrap + + # seed if desired + # rng = random.Random(0) + rng = random + + @_register_data('Array with doubles') + def array_with_doubles(size): + test_object = [sys.maxsize * rng.random() for _ in range(size)] + return test_object + + @_register_data('Array with UTF-8 strings') + def array_with_utf8_strings(size): + utf8_string = ( + "نظام الحكم سلطاني وراثي " + "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" + " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " + ) + test_object = [utf8_string for _ in range(size)] + return test_object + + @_register_data('Medium complex object') + def medium_complex_object(size): + user = { + "userId": 3381293, + "age": 213, + 
"username": "johndoe", + "fullname": "John Doe the Second", + "isAuthorized": True, + "liked": 31231.31231202, + "approval": 31.1471, + "jobs": [1, 2], + "currJob": None, + } + friends = [user, user, user, user, user, user, user, user] + test_object = [[user, friends] for _ in range(size)] + return test_object + + @_register_data('Array with True values') + def true_values(size): + test_object = [True for _ in range(size)] + return test_object + + @_register_data('Array of Dict[str, int]') + def array_of_dict_string_int(size): + test_object = [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(size) + ] + return test_object + + @_register_data('Dict of List[Dict[str, int]]') + def dict_of_list_dict_str_int(size): + keys = set() + while len(keys) < size: + key = str(rng.random() * 20) + keys.add(key) + test_object = { + key: [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(256) + ] + for key in keys + } + return test_object + + @_register_data('Complex object') + def complex_object(size): + import json + # TODO: might be better to reigster this file with setup.py or + # download it via some mechanism + try: + dpath = ub.Path(__file__).parent + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + except Exception: + import ujson + dpath = ub.Path(ujson.__file__).parent / 'tests' + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + with open(fpath, 'r') as f: + test_object = json.load(f) + if size > 1: + test_object = [test_object] * size + return test_object + + return data_lut diff --git a/tests/benchmark3.py b/tests/benchmark3.py deleted file mode 100644 index 2563d4d..0000000 --- a/tests/benchmark3.py +++ /dev/null @@ -1,244 +0,0 @@ -""" -Roadmap: - - - [ ] -""" -import random -import sys -import ubelt as ub - - -def json_test_data_generators(): - """ - Generates data for benchmarks with various sizes - - Returns: - Dict[str, callable]: - a mapping from test data name to its generator - - Example: - >>> data_lut = json_test_data_generators() - >>> size = 2 - >>> keys = sorted(set(data_lut) - {'Complex object'}) - >>> for key in keys: - >>> func = data_lut[key] - >>> test_object = func(size) - >>> print('key = {!r}'.format(key)) - >>> print('test_object = {!r}'.format(test_object)) - """ - data_lut = {} - def _register_data(name): - def _wrap(func): - data_lut[name] = func - return _wrap - - # seed if desired - #rng = random.Random() - rng = random - - @_register_data('Array with doubles') - def array_with_doubles(size): - test_object = [sys.maxsize * rng.random() for _ in range(size)] - return test_object - - @_register_data('Array with UTF-8 strings') - def array_with_utf8_strings(size): - utf8_string = ( - "نظام الحكم سلطاني وراثي " - "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" - " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " - ) - test_object = [utf8_string for _ in range(size)] - return test_object - - @_register_data('Medium complex object') - def medium_complex_object(size): - user = { - "userId": 3381293, - "age": 213, - "username": "johndoe", - "fullname": "John Doe the Second", - "isAuthorized": True, - "liked": 31231.31231202, - "approval": 31.1471, - "jobs": [1, 2], - "currJob": None, - } - friends = [user, user, user, user, user, user, user, user] - test_object = [[user, friends] for _ in range(size)] - return test_object - - @_register_data('Array with True values') - def true_values(size): - test_object = [True for _ in 
range(size)] - return test_object - - @_register_data('Array of Dict[str, int]') - def array_of_dict_string_int(size): - test_object = [ - {str(rng.random() * 20): int(rng.random() * 1000000)} - for _ in range(size) - ] - return test_object - - @_register_data('Dict of List[Dict[str, int]]') - def dict_of_list_dict_str_int(size): - keys = set() - while len(keys) < size: - key = str(rng.random() * 20) - keys.add(key) - test_object = { - key: [ - {str(rng.random() * 20): int(rng.random() * 1000000)} - for _ in range(256) - ] - for key in keys - } - return test_object - - @_register_data('Complex object') - def complex_object(size): - import json - # TODO: might be better to reigster this file with setup.py or - # download it via some mechanism - try: - dpath = ub.Path(__file__).parent - fpath = dpath / 'sample.json' - if not fpath.exists(): - raise Exception - except Exception: - import ujson - dpath = ub.Path(ujson.__file__).parent / 'tests' - fpath = dpath / 'sample.json' - if not fpath.exists(): - raise Exception - with open(fpath, 'r') as f: - test_object = json.load(f) - if size > 1: - test_object = [test_object] * size - return test_object - - return data_lut - - -def available_json_impls(): - import importlib - known_modnames = [ - 'ujson', 'json', 'nujson', 'orjson', 'simplejson' - ] - json_impls = {} - for libname in known_modnames: - try: - module = importlib.import_module(libname) - except ImportError: - pass - else: - json_impls[libname] = { - 'module': module, - 'version': module.__version__, - } - return json_impls - - -def benchmark_json_dumps(): - # TODO: remove this hack - sys.path.append(ub.expandpath('~/code/ultrajson/tests')) - from benchmarker import Benchmarker - - json_impls = available_json_impls() - data_lut = json_test_data_generators() - - list(data_lut.keys()) - - # These are the parameters that we benchmark over - basis = { - "input": [ - 'Array with doubles', - 'Array with UTF-8 strings', - # 'Medium complex object', - 'Array with True values', - 'Array of Dict[str, int]', - # 'Dict of List[Dict[str, int]]', - # 'Complex object' - ], - "size": [1, 32, 256, 1024, 2048], - "impl": list(json_impls.keys()), - } - - # The Benchmarker class is a new experimental API around timerit to - # abstract away the details of timing a process over a grid of parameters, - # serializing the results, and aggregating results from disparate runs. - benchmark = Benchmarker( - name='bench_json_dumps', - num=100, - bestof=10, - verbose=2, - basis=basis, - ) - - # For each variation of your experiment, create a row. - for params in benchmark.iter_params(): - # Make any modifications you need to compute input kwargs for each - # method here. - impl_info = json_impls[params["impl"]] - method = impl_info['module'].dumps - impl_version = impl_info['version'] - params["impl_version"] = impl_version - data = data_lut[params["input"]](params["size"]) - # Timerit will run some user-specified number of loops. - # and compute time stats with similar methodology to timeit - for timer in benchmark.measure(): - # Put any setup logic you dont want to time here. - # ... 
- with timer: - # Put the logic you want to time here - method(data) - - dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() - benchmark.dump_in_dpath(dpath) - - RECORD_ALL = 0 - metric_key = "time" if RECORD_ALL else "mean_time" - - from benchmarker import result_analysis - results = benchmark.result.to_result_list() - - analysis = result_analysis.ResultAnalysis( - results, - metrics=[metric_key], - params=['impl'], - metric_objectives={ - 'min_time': 'min', - 'mean_time': 'min', - 'time': 'min', - }) - analysis.analysis() - analysis.table - - param_group = ['impl', 'impl_version'] - analysis.abalate(param_group) - # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) - - xlabel = "size" - # Set these to empty lists if they are not used - group_labels = { - "col": ["input"], - "hue": ["impl"], - "size": [], - } - import kwplot - kwplot.autompl() - facet = analysis.plot(xlabel, metric_key, group_labels) - for ax in facet.axes.ravel(): - ax.set_xscale('log') - ax.set_yscale('log') - print('facet = {!r}'.format(facet)) - kwplot.show_if_requested() - - -if __name__ == "__main__": - """ - CommandLine: - python ~/code/ultrajson/tests/benchmark3.py --show - """ - benchmark_json_dumps() diff --git a/tests/benchmarker/__init__.py b/tests/benchmarker/__init__.py deleted file mode 100644 index 1d04095..0000000 --- a/tests/benchmarker/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -A helper module for executing, serializing, combining, and comparing benchmarks -""" - -__mkinit__ = """ -# Autogenerate this file -mkinit ~/code/ultrajson/tests/benchmarker/__init__.py -w -""" - -__version__ = '0.1.0' - -from benchmarker import aggregate -from benchmarker import benchmarker -from benchmarker import process_context -from benchmarker import result_analysis -from benchmarker import util_json -from benchmarker import visualize - -from benchmarker.aggregate import (demo, demo_data,) -from benchmarker.benchmarker import (Benchmarker, BenchmarkerConfig, - BenchmarkerResult, combine_stats, - stats_dict,) -from benchmarker.process_context import (ProcessContext,) -from benchmarker.result_analysis import (Result, ResultAnalysis, SkillTracker,) -from benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from benchmarker.visualize import (benchmark_analysis,) - -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'ProcessContext', 'Result', 'ResultAnalysis', 'SkillTracker', - 'aggregate', 'benchmark_analysis', 'benchmarker', 'combine_stats', - 'demo', 'demo_data', 'ensure_json_serializable', - 'find_json_unserializable', 'indexable_allclose', 'process_context', - 'result_analysis', 'stats_dict', 'util_json', 'visualize'] From a89bc27ff518ccf5f5c878dc649fa9d9533fec1e Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 16:27:26 -0400 Subject: [PATCH 06/25] add support for loads and complex object bench --- json_benchmarks/benchmarker/benchmarker.py | 9 ++++- .../benchmarker/result_analysis.py | 4 +-- json_benchmarks/core.py | 36 +++++++++++++++---- json_benchmarks/datagen.py | 3 +- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 6ff05d5..c59a28f 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -121,7 +121,14 @@ class Benchmarker: def iter_params(self): self.context.start() - grid_iter = list(ub.named_product(self.basis)) + if 
isinstance(self.basis, dict): + grid_iter = ub.named_product(self.basis) + else: + grid_iter = ub.flatten([ + ub.named_product(b) + for b in self.basis + ]) + for params in grid_iter: self.params = params self.key = ub.repr2(params, compact=1, si=1) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index e07d027..5decbea 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -776,8 +776,8 @@ class ResultAnalysis(ub.NiceRepr): fig_params = plot_kws.pop("fig", []) facet_kws = { - 'sharex': False, - 'sharey': False, + 'sharex': True, + 'sharey': True, } # facet_kws['col'] = plot_kws.pop("col", None) # facet_kws['row'] = plot_kws.pop("row", None) diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 404a9f8..1bf6e65 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -73,7 +73,11 @@ def benchmark_json(): data_lut = datagen.json_test_data_generators() # These are the parameters that we benchmark over - basis = { + common_basis = { + "impl": list(json_impls.keys()), + "func": ['dumps', 'loads'], + } + sized_basis = { "input": [ 'Array with doubles', 'Array with UTF-8 strings', @@ -83,10 +87,20 @@ def benchmark_json(): # 'Dict of List[Dict[str, int]]', # 'Complex object' ], - "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], - "impl": list(json_impls.keys()), - "func": ['dumps', 'loads'], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512], + # 1024, 2048, 4096, 8192, 12288], } + predefined_basis = { + "input": [ + 'Complex object' + ], + 'size': [None], + } + + basis = [ + ub.dict_union(common_basis, predefined_basis), + ub.dict_union(common_basis, sized_basis), + ] # The Benchmarker class is a new experimental API around timerit to # abstract away the details of timing a process over a grid of parameters, @@ -95,12 +109,18 @@ def benchmark_json(): name='bench_json', num=100, bestof=10, - verbose=2, + verbose=3, basis=basis, ) + def is_blocked(params): + if params['input'] == 'Complex object' and params['impl'] == 'orjson': + return True + # For each variation of your experiment, create a row. for params in benchmark.iter_params(): + if is_blocked(params): + continue # Make any modifications you need to compute input kwargs for each # method here. 
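        # Note: for func='loads' the payload is pre-serialized with the stdlib
        # json module below, so every implementation parses identical input
        # text for a given grid point.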
impl_info = json_impls[params["impl"]] @@ -198,7 +218,7 @@ def aggregate_results(result_fpaths): new_data = pd.DataFrame(new_rows) return new_data - single_size = table[table['size'] == 256] + single_size = table[(table['size'] == 256) | table['size'].isnull()] # single_size_combo = aggregate_time_stats(single_size, None) single_size_combo = aggregate_time_stats(single_size, ['name']) @@ -206,7 +226,8 @@ def aggregate_results(result_fpaths): single_size_combo['calls/sec'] = 1 / single_size_combo['mean_time'] _single_size_combo = single_size_combo.copy() _single_size_combo['calls/sec'] = _single_size_combo['calls/sec'].apply(lambda x: '{:,.02f}'.format(x)) - piv = _single_size_combo.pivot('input', param_group, 'calls/sec') + piv = _single_size_combo.pivot(['input', 'func'], param_group, 'calls/sec') + print('Table for size=256') print(piv) analysis.abalate(param_group) @@ -216,6 +237,7 @@ def aggregate_results(result_fpaths): # Set these to empty lists if they are not used group_labels = { "fig": ["input"], + "col": ["func"], "hue": ["impl", "impl_version"], "size": [], } diff --git a/json_benchmarks/datagen.py b/json_benchmarks/datagen.py index ff27d6c..afb2708 100644 --- a/json_benchmarks/datagen.py +++ b/json_benchmarks/datagen.py @@ -1,5 +1,6 @@ import random import sys +import ubelt as ub def json_test_data_generators(): @@ -108,7 +109,7 @@ def json_test_data_generators(): raise Exception with open(fpath, 'r') as f: test_object = json.load(f) - if size > 1: + if size is not None: test_object = [test_object] * size return test_object From 4d0f705d6d24f71a9bc0af0812ad7d6d51faa5c5 Mon Sep 17 00:00:00 2001 From: joncrall Date: Fri, 27 May 2022 10:15:48 -0400 Subject: [PATCH 07/25] wip --- json_benchmarks/benchmarker/result_analysis.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 5decbea..2b270a5 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -795,7 +795,10 @@ class ResultAnalysis(ub.NiceRepr): if 'marker' not in plot_kws: plot_kws['marker'] = "o" + # We will want to overwrite this with our own std estimate plot_kws['ci'] = "sd" + # err_style='band', + # err_kws=None, # Use a consistent pallete across plots unique_hues = data['hue_key'].unique() @@ -814,7 +817,9 @@ class ResultAnalysis(ub.NiceRepr): # fig.clf() facet = sns.relplot( - data=group, kind='line', + data=group, + # kind='line', + kind='scatter', facet_kws=facet_kws, **plot_kws) From 3159c0088990f3369bb99e062df15986952fd9dd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 May 2022 14:19:02 +0000 Subject: [PATCH 08/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/__main__.py | 4 +- json_benchmarks/benchmarker/__init__.py | 80 ++-- json_benchmarks/benchmarker/aggregate.py | 52 +-- json_benchmarks/benchmarker/benchmarker.py | 107 ++--- .../benchmarker/process_context.py | 77 ++-- .../benchmarker/result_analysis.py | 379 +++++++++++------- json_benchmarks/benchmarker/util_json.py | 41 +- json_benchmarks/benchmarker/visualize.py | 8 +- json_benchmarks/core.py | 147 ++++--- json_benchmarks/datagen.py | 30 +- 10 files changed, 544 insertions(+), 381 deletions(-) diff --git a/json_benchmarks/__main__.py b/json_benchmarks/__main__.py index faf03f6..c4ddc30 100644 --- a/json_benchmarks/__main__.py +++ 
b/json_benchmarks/__main__.py @@ -1,8 +1,8 @@ - -if __name__ == '__main__': +if __name__ == "__main__": """ CommandLine: python -m json_benchmarks """ from json_benchmarks import core + core.main() diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index 8abb4c5..ee32ea0 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -7,33 +7,59 @@ __mkinit__ = """ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w """ -__version__ = '0.1.0' +__version__ = "0.1.0" -from json_benchmarks.benchmarker import aggregate -from json_benchmarks.benchmarker import benchmarker -from json_benchmarks.benchmarker import process_context -from json_benchmarks.benchmarker import result_analysis -from json_benchmarks.benchmarker import util_json -from json_benchmarks.benchmarker import visualize - -from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) -from json_benchmarks.benchmarker.benchmarker import (Benchmarker, - BenchmarkerConfig, - BenchmarkerResult, - combine_stats, - stats_dict,) -from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker import ( + aggregate, + benchmarker, + process_context, + result_analysis, + util_json, + visualize, +) +from json_benchmarks.benchmarker.aggregate import demo, demo_data +from json_benchmarks.benchmarker.benchmarker import ( + Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, + combine_stats, + stats_dict, +) +from json_benchmarks.benchmarker.process_context import ProcessContext from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) -from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + DEFAULT_METRIC_TO_OBJECTIVE, + Result, + ResultAnalysis, + SkillTracker, +) +from json_benchmarks.benchmarker.util_json import ( + ensure_json_serializable, + find_json_unserializable, + indexable_allclose, +) +from json_benchmarks.benchmarker.visualize import benchmark_analysis -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', - 'ResultAnalysis', 'SkillTracker', 'aggregate', 'benchmark_analysis', - 'benchmarker', 'combine_stats', 'demo', 'demo_data', - 'ensure_json_serializable', 'find_json_unserializable', - 'indexable_allclose', 'process_context', 'result_analysis', - 'stats_dict', 'util_json', 'visualize'] +__all__ = [ + "Benchmarker", + "BenchmarkerConfig", + "BenchmarkerResult", + "DEFAULT_METRIC_TO_OBJECTIVE", + "ProcessContext", + "Result", + "ResultAnalysis", + "SkillTracker", + "aggregate", + "benchmark_analysis", + "benchmarker", + "combine_stats", + "demo", + "demo_data", + "ensure_json_serializable", + "find_json_unserializable", + "indexable_allclose", + "process_context", + "result_analysis", + "stats_dict", + "util_json", + "visualize", +] diff --git a/json_benchmarks/benchmarker/aggregate.py b/json_benchmarks/benchmarker/aggregate.py index b2d74c9..bba5771 100644 --- a/json_benchmarks/benchmarker/aggregate.py +++ b/json_benchmarks/benchmarker/aggregate.py @@ -1,31 +1,36 @@ import json + import pandas as pd import ubelt as ub def demo_data(): - from json_benchmarks.benchmarker.benchmarker import Benchmarker import numpy as np + + from json_benchmarks.benchmarker.benchmarker import Benchmarker + impl_lut 
= { - 'numpy': np.sum, - 'builtin': sum, - } - def data_lut(params): - item = 42 if params['dtype'] == 'int' else 42.0 - data = [item] * params['size'] - return data - basis = { - 'impl': ['builtin', 'numpy'], - 'size': [10, 10000], - 'dtype': ['int', 'float'], + "numpy": np.sum, + "builtin": sum, } - dpath = ub.Path.appdir('benchmarker/agg_demo').delete().ensuredir() + def data_lut(params): + item = 42 if params["dtype"] == "int" else 42.0 + data = [item] * params["size"] + return data + + basis = { + "impl": ["builtin", "numpy"], + "size": [10, 10000], + "dtype": ["int", "float"], + } + + dpath = ub.Path.appdir("benchmarker/agg_demo").delete().ensuredir() def run_one_benchmark(): - self = Benchmarker(name='agg_demo', num=10, bestof=3, basis=basis) + self = Benchmarker(name="agg_demo", num=10, bestof=3, basis=basis) for params in self.iter_params(): - impl = impl_lut[params['impl']] + impl = impl_lut[params["impl"]] data = data_lut(params) for timer in self.measure(): with timer: @@ -43,25 +48,26 @@ def demo_data(): def demo(): - from json_benchmarks.benchmarker import BenchmarkerResult - from json_benchmarks.benchmarker import result_analysis + from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis + fpaths = demo_data() results = [] for fpath in fpaths: data = json.loads(fpath.read_text()) - for row in data['rows']: + for row in data["rows"]: result = BenchmarkerResult.load(fpath) results.extend(result.to_result_list()) analysis = result_analysis.ResultAnalysis( results, - metrics=['min', 'mean'], - params=['impl'], + metrics=["min", "mean"], + params=["impl"], metric_objectives={ - 'min': 'min', - 'mean': 'min', - }) + "min": "min", + "mean": "min", + }, + ) analysis.analysis() # single_df = pd.DataFrame(data['rows']) # context = data['context'] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index c59a28f..008ba82 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -1,43 +1,46 @@ import json +from dataclasses import dataclass + +import numpy as np import timerit import ubelt as ub -import numpy as np -from dataclasses import dataclass + from json_benchmarks.benchmarker.process_context import ProcessContext @dataclass class BenchmarkerConfig: - name : str = None - num : int = 100 - bestof : int = 10 + name: str = None + num: int = 100 + bestof: int = 10 class BenchmarkerResult: """ Serialization for a single benchmark result """ + def __init__(self, context, rows): self.context = context self.rows = rows def __json__(self): data = { - 'type': 'benchmark_result', - 'context': self.context, - 'rows': self.rows, + "type": "benchmark_result", + "context": self.context, + "rows": self.rows, } return data @classmethod def from_json(cls, data): - assert data['type'] == 'benchmark_result' - self = cls(data['context'], data['rows']) + assert data["type"] == "benchmark_result" + self = cls(data["context"], data["rows"]) return self @classmethod def load(cls, fpath): - with open(fpath, 'r') as file: + with open(fpath) as file: data = json.load(file) self = cls.from_json(data) return self @@ -50,14 +53,15 @@ class BenchmarkerResult: List[Result] """ from json_benchmarks.benchmarker import result_analysis + results = [] for row in self.rows: result = result_analysis.Result( - name=row['name'], - metrics=row['metrics'], - params=row['params'].copy(), + name=row["name"], + metrics=row["metrics"], + params=row["params"].copy(), ) - machine = self.context['machine'] + machine 
= self.context["machine"] assert not ub.dict_isect(result.params, machine) result.params.update(machine) results.append(result) @@ -94,6 +98,7 @@ class Benchmarker: >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir() >>> self.dump_in_dpath(dpath) """ + def __init__(self, basis={}, verbose=1, **kwargs): self.basis = basis @@ -111,11 +116,11 @@ class Benchmarker: def dump_in_dpath(self, dpath): dpath = ub.Path(dpath) - timestamp = self.context.obj['stop_timestamp'] - fname = f'benchmarks_{self.config.name}_{timestamp}.json' + timestamp = self.context.obj["stop_timestamp"] + fname = f"benchmarks_{self.config.name}_{timestamp}.json" fpath = dpath / fname - with open(fpath, 'w') as file: + with open(fpath, "w") as file: json.dump(self.result.__json__(), file) return fpath @@ -124,10 +129,7 @@ class Benchmarker: if isinstance(self.basis, dict): grid_iter = ub.named_product(self.basis) else: - grid_iter = ub.flatten([ - ub.named_product(b) - for b in self.basis - ]) + grid_iter = ub.flatten([ub.named_product(b) for b in self.basis]) for params in grid_iter: self.params = params @@ -137,8 +139,7 @@ class Benchmarker: self.result = BenchmarkerResult(obj, self.rows) def measure(self): - for timer in self.ti.reset(self.key): - yield timer + yield from self.ti.reset(self.key) rows = self.rows ti = self.ti @@ -151,29 +152,29 @@ class Benchmarker: "time": time, } row = { - 'name': key, - 'metrics': metrics, - 'params': params, + "name": key, + "metrics": metrics, + "params": params, } rows.append(row) else: times = np.array(ti.robust_times()) - metrics = stats_dict(times, '_time') + metrics = stats_dict(times, "_time") row = { - 'metrics': metrics, - 'params': params, - 'name': key, + "metrics": metrics, + "params": params, + "name": key, } rows.append(row) -def stats_dict(data, suffix=''): +def stats_dict(data, suffix=""): stats = { - 'nobs' + suffix: len(data), - 'mean' + suffix: data.mean(), - 'std' + suffix: data.std(), - 'min' + suffix: data.min(), - 'max' + suffix: data.max(), + "nobs" + suffix: len(data), + "mean" + suffix: data.mean(), + "std" + suffix: data.std(), + "min" + suffix: data.min(), + "max" + suffix: data.max(), } return stats @@ -210,27 +211,27 @@ def combine_stats(s1, s2): """ stats = [s1, s2] data = { - 'nobs': np.array([s['nobs'] for s in stats]), - 'mean': np.array([s['mean'] for s in stats]), - 'std': np.array([s['std'] for s in stats]), - 'min': np.array([s['min'] for s in stats]), - 'max': np.array([s['max'] for s in stats]), + "nobs": np.array([s["nobs"] for s in stats]), + "mean": np.array([s["mean"] for s in stats]), + "std": np.array([s["std"] for s in stats]), + "min": np.array([s["min"] for s in stats]), + "max": np.array([s["max"] for s in stats]), } combine_stats_arrs(data) def combine_stats_arrs(data): - sizes = data['nobs'] - means = data['mean'] - stds = data['std'] - mins = data['min'] - maxs = data['max'] + sizes = data["nobs"] + means = data["mean"] + stds = data["std"] + mins = data["min"] + maxs = data["max"] varis = stds * stds combo_size = sizes.sum() combo_mean = (sizes * means).sum() / combo_size - mean_deltas = (means - combo_mean) + mean_deltas = means - combo_mean sv = (sizes * varis).sum() sm = (sizes * (mean_deltas * mean_deltas)).sum() @@ -238,10 +239,10 @@ def combine_stats_arrs(data): combo_std = np.sqrt(combo_vars) combo_stats = { - 'nobs': combo_size, - 'mean': combo_mean, - 'std': combo_std, - 'min': mins.min(), - 'max': maxs.max(), + "nobs": combo_size, + "mean": combo_mean, + "std": combo_std, + "min": mins.min(), + "max": maxs.max(), } 
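# Note added for clarity (a sketch, not part of the patch): the combination
# above follows the standard grouped-data identity. With group sizes n_i,
# means m_i, and variances v_i = std_i**2:
#   combined_mean = sum(n_i * m_i) / N,   where N = sum(n_i)
#   combined_var  = (sum(n_i * v_i) + sum(n_i * (m_i - combined_mean)**2)) / N
# i.e. the pooled within-group variance plus the spread of the group means.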
return combo_stats diff --git a/json_benchmarks/benchmarker/process_context.py b/json_benchmarks/benchmarker/process_context.py index bce02c0..a66062a 100644 --- a/json_benchmarks/benchmarker/process_context.py +++ b/json_benchmarks/benchmarker/process_context.py @@ -1,8 +1,9 @@ -import ubelt as ub -import socket import platform +import socket import sys +import ubelt as ub + class ProcessContext: """ @@ -20,59 +21,71 @@ class ProcessContext: args = sys.argv self.obj = { - 'type': 'process_context', - 'name': name, - 'args': args, - 'config': config, - 'machine': None, - 'start_timestamp': None, - 'stop_timestamp': None, + "type": "process_context", + "name": name, + "args": args, + "config": config, + "machine": None, + "start_timestamp": None, + "stop_timestamp": None, } def _timestamp(self): import datetime - timestamp = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() - timestamp = timestamp.replace(':', '') + + timestamp = ( + datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() + ) + timestamp = timestamp.replace(":", "") # timestamp = ub.timestamp() return timestamp def _hostinfo(self): return { - 'host': socket.gethostname(), - 'user': ub.Path(ub.userhome()).name, + "host": socket.gethostname(), + "user": ub.Path(ub.userhome()).name, # 'cwd': os.getcwd(), } def _osinfo(self): - uname_system, _, uname_release, uname_version, _, uname_processor = platform.uname() + ( + uname_system, + _, + uname_release, + uname_version, + _, + uname_processor, + ) = platform.uname() return { - 'os_name': uname_system, - 'os_release': uname_release, - 'os_version': uname_version, - 'arch': uname_processor, + "os_name": uname_system, + "os_release": uname_release, + "os_version": uname_version, + "arch": uname_processor, } def _pyinfo(self): return { - 'py_impl': platform.python_implementation(), - 'py_version': sys.version.replace("\n", ""), + "py_impl": platform.python_implementation(), + "py_version": sys.version.replace("\n", ""), } def _meminfo(self): import psutil + # TODO: could collect memory info at start and stop and intermediate # stages. Here we just want info that is static wrt to the machine. # For now, just get the total available. 
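# Illustrative sketch (not part of the patch): `psutil.virtual_memory()`
# returns a namedtuple whose `total` field is the physical memory in bytes.
import psutil
total_gib = psutil.virtual_memory().total / (1024 ** 3)  # e.g. RAM in GiB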
svmem_info = psutil.virtual_memory() return { - 'mem_total': svmem_info.total, + "mem_total": svmem_info.total, } def _cpuinfo(self): import cpuinfo + _cpu_info = cpuinfo.get_cpu_info() cpu_info = { - 'cpu_brand': _cpu_info['brand_raw'], + "cpu_brand": _cpu_info["brand_raw"], } return cpu_info @@ -86,17 +99,21 @@ class ProcessContext: ) def start(self): - self.obj.update({ - 'machine': self._machine(), - 'start_timestamp': self._timestamp(), - 'stop_timestamp': None, - }) + self.obj.update( + { + "machine": self._machine(), + "start_timestamp": self._timestamp(), + "stop_timestamp": None, + } + ) return self def stop(self): - self.obj.update({ - 'stop_timestamp': self._timestamp(), - }) + self.obj.update( + { + "stop_timestamp": self._timestamp(), + } + ) return self.obj def __enter__(self): diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 2b270a5..944d85e 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -1,23 +1,23 @@ import itertools as it import math +import warnings + import numpy as np import pandas as pd -import ubelt as ub -import warnings import scipy import scipy.stats # NOQA - +import ubelt as ub # a list of common objectives DEFAULT_METRIC_TO_OBJECTIVE = { - 'time': 'min', - 'ap': 'max', - 'acc': 'max', - 'f1': 'max', - 'mcc': 'max', + "time": "min", + "ap": "max", + "acc": "max", + "f1": "max", + "mcc": "max", # - 'loss': 'min', - 'brier': 'min', + "loss": "min", + "brier": "min", } @@ -49,6 +49,7 @@ class Result(ub.NiceRepr): >>> self = Result.demo(mode='alt', rng=32) >>> print('self = {}'.format(self)) """ + def __init__(self, name, params, metrics, meta=None): self.name = name self.params = params @@ -56,7 +57,7 @@ class Result(ub.NiceRepr): self.meta = meta def to_dict(self): - row = ub.dict_union({'name': self.name}, self.metrics, self.params) + row = ub.dict_union({"name": self.name}, self.metrics, self.params) return row def __nice__(self): @@ -65,41 +66,44 @@ class Result(ub.NiceRepr): return text @classmethod - def demo(cls, mode='null', rng=None): - import numpy as np + def demo(cls, mode="null", rng=None): import string + import kwarray + import numpy as np + rng = kwarray.ensure_rng(rng) - if mode == 'null': + if mode == "null": # The null hypothesis should generally be true here, # there is no relation between the results and parameters demo_param_space = { - 'param1': list(range(3)), - 'param2': np.linspace(0, 10, 10), - 'param3': list(string.ascii_lowercase[0:3]), + "param1": list(range(3)), + "param2": np.linspace(0, 10, 10), + "param3": list(string.ascii_lowercase[0:3]), } params = {k: rng.choice(b) for k, b in demo_param_space.items()} metrics = { - 'f1': rng.rand(), - 'acc': rng.rand(), + "f1": rng.rand(), + "acc": rng.rand(), } - elif mode == 'alt': + elif mode == "alt": # The alternative hypothesis should be true here, there is a # relationship between results two of the params. 
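# Note added for clarity (not part of the patch): `expit` below is the
# logistic sigmoid, expit(x) = 1 / (1 + exp(-x)); it squashes the score r
# into (0, 1) so `acc` behaves like an accuracy, e.g. expit(0) == 0.5.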
from scipy.special import expit + params = { - 'u': rng.randint(0, 1 + 1), - 'v': rng.randint(-1, 1 + 1), - 'x': rng.randint(-2, 3 + 1), - 'y': rng.randint(-1, 2 + 1), - 'z': rng.randint(-0, 3 + 1), + "u": rng.randint(0, 1 + 1), + "v": rng.randint(-1, 1 + 1), + "x": rng.randint(-2, 3 + 1), + "y": rng.randint(-1, 2 + 1), + "z": rng.randint(-0, 3 + 1), } noise = np.random.randn() * 1 - r = 3 * params['x'] + params['y'] ** 2 + 0.3 * params['z'] ** 3 + r = 3 * params["x"] + params["y"] ** 2 + 0.3 * params["z"] ** 3 acc = expit(r / 20 + noise) metrics = { - 'acc': acc, + "acc": acc, } else: raise KeyError(mode) @@ -210,10 +214,18 @@ class ResultAnalysis(ub.NiceRepr): ttest_ind: p=0.7626 """ - def __init__(self, results, metrics=None, params=None, ignore_params=None, - ignore_metrics=None, metric_objectives=None, - abalation_orders={1}, default_objective='max', - p_threshold=0.05): + def __init__( + self, + results, + metrics=None, + params=None, + ignore_params=None, + ignore_metrics=None, + metric_objectives=None, + abalation_orders={1}, + default_objective="max", + p_threshold=0.05, + ): self.results = results if ignore_metrics is None: ignore_metrics = set() @@ -237,21 +249,22 @@ class ResultAnalysis(ub.NiceRepr): self.p_threshold = p_threshold self._description = {} - self._description['built'] = False - self._description['num_results'] = len(self.results) + self._description["built"] = False + self._description["num_results"] = len(self.results) def __nice__(self): return ub.repr2(self._description, si=1, sv=1) @classmethod - def demo(cls, num=10, mode='null', rng=None): + def demo(cls, num=10, mode="null", rng=None): import kwarray + rng = kwarray.ensure_rng(rng) results = [Result.demo(mode=mode, rng=rng) for _ in range(num)] - if mode == 'null': - self = cls(results, metrics={'f1', 'acc'}) + if mode == "null": + self = cls(results, metrics={"f1", "acc"}) else: - self = cls(results, metrics={'acc'}) + self = cls(results, metrics={"acc"}) return self def run(self): @@ -284,7 +297,8 @@ class ResultAnalysis(ub.NiceRepr): # remove nans varied = { k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} - for k, vs in varied.items()} + for k, vs in varied.items() + } varied = {k: vs for k, vs in varied.items() if len(vs)} return varied @@ -339,9 +353,9 @@ class ResultAnalysis(ub.NiceRepr): """ objective = self.metric_objectives.get(metric_key, None) if objective is None: - warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + warnings.warn(f"warning assume {self.default_objective} for {metric_key=}") objective = self.default_objective - ascending = (objective == 'min') + ascending = objective == "min" return ascending def abalate(self, param_group): @@ -367,10 +381,13 @@ class ResultAnalysis(ub.NiceRepr): # For hashable generic dictionary from collections import namedtuple - gd = namedtuple('config', param_group) + + gd = namedtuple("config", param_group) # from types import SimpleNamespace - param_unique_vals_ = self.table[param_group].drop_duplicates().to_dict('records') + param_unique_vals_ = ( + self.table[param_group].drop_duplicates().to_dict("records") + ) param_unique_vals = [gd(**d) for d in param_unique_vals_] # param_unique_vals = {p: self.table[p].unique().tolist() for p in param_group} score_improvements = ub.ddict(list) @@ -405,19 +422,29 @@ class ResultAnalysis(ub.NiceRepr): best_group.set_index(param_group) # best_group[param_group] # best_group[metric_key].diff() - scored_ranking = best_group[param_group + [metric_key]].reset_index(drop=True) 
+ scored_ranking = best_group[param_group + [metric_key]].reset_index( + drop=True + ) scored_obs.append(scored_ranking) - ranking = [gd(**d) for d in scored_ranking[param_group].to_dict('records')] + ranking = [ + gd(**d) for d in scored_ranking[param_group].to_dict("records") + ] skillboard.observe(ranking) - print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) + print( + "skillboard.ratings = {}".format( + ub.repr2(skillboard.ratings, nl=1, align=":") + ) + ) win_probs = skillboard.predict_win() - print('win_probs = {}'.format(ub.repr2(win_probs, nl=1))) + print(f"win_probs = {ub.repr2(win_probs, nl=1)}") for key, improves in score_improvements.items(): k1, k2, metric_key = key improves = np.array(improves) pos_delta = improves[improves > 0] - print(f'\nWhen {k1} is better than {k2}, the improvement in {metric_key} is') + print( + f"\nWhen {k1} is better than {k2}, the improvement in {metric_key} is" + ) print(pd.DataFrame([pd.Series(pos_delta).describe().T])) return scored_obs @@ -441,10 +468,10 @@ class ResultAnalysis(ub.NiceRepr): >>> stats_row = self.test_group(param_group, metric_key) >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, sort=0, precision=2))) """ - param_group_name = ','.join(param_group) + param_group_name = ",".join(param_group) stats_row = { - 'param_name': param_group_name, - 'metric': metric_key, + "param_name": param_group_name, + "metric": metric_key, } # param_values = varied[param_name] # stats_row['param_values'] = param_values @@ -463,7 +490,7 @@ class ResultAnalysis(ub.NiceRepr): nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) for param_value, group in self.table.groupby(param_group): - metric_group = group[['name', metric_key] + varied_cols] + metric_group = group[["name", metric_key] + varied_cols] metric_vals = metric_group[metric_key] metric_vals = metric_vals.dropna() if len(metric_vals) > 0: @@ -473,18 +500,24 @@ class ResultAnalysis(ub.NiceRepr): value_to_metric[param_value] = metric_vals.values moments = pd.DataFrame(value_to_metric_stats).T - moments = moments.sort_values('mean', ascending=ascending) + moments = moments.sort_values("mean", ascending=ascending) moments.index.name = param_group_name moments.columns.name = metric_key - ranking = moments['mean'].index.to_list() + ranking = moments["mean"].index.to_list() param_to_rank = ub.invert_dict(dict(enumerate(ranking))) # Determine a set of value pairs to do pairwise comparisons on value_pairs = ub.oset() # value_pairs.update( # map(frozenset, ub.iter_window(moments.index, 2))) - value_pairs.update(map(frozenset, ub.iter_window( - moments.sort_values('mean', ascending=ascending).index, 2))) + value_pairs.update( + map( + frozenset, + ub.iter_window( + moments.sort_values("mean", ascending=ascending).index, 2 + ), + ) + ) # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance # If the researcher can make the assumptions of an identically @@ -508,11 +541,11 @@ class ResultAnalysis(ub.NiceRepr): else: anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) - stats_row['anova_rank_H'] = anova_krus_result.statistic - stats_row['anova_rank_p'] = anova_krus_result.pvalue - stats_row['anova_mean_F'] = anova_1way_result.statistic - stats_row['anova_mean_p'] = anova_1way_result.pvalue - stats_row['moments'] = moments + stats_row["anova_rank_H"] = anova_krus_result.statistic + stats_row["anova_rank_p"] = anova_krus_result.pvalue + stats_row["anova_mean_F"] = anova_1way_result.statistic + 
stats_row["anova_mean_p"] = anova_1way_result.pvalue + stats_row["moments"] = moments pair_stats_list = [] for pair in value_pairs: @@ -524,43 +557,50 @@ class ResultAnalysis(ub.NiceRepr): rank1 = param_to_rank[param_val1] rank2 = param_to_rank[param_val2] - pair_stats['winner'] = param_val1 if rank1 < rank2 else param_val2 - pair_stats['value1'] = param_val1 - pair_stats['value2'] = param_val2 - pair_stats['n1'] = len(metric_vals1) - pair_stats['n2'] = len(metric_vals2) + pair_stats["winner"] = param_val1 if rank1 < rank2 else param_val2 + pair_stats["value1"] = param_val1 + pair_stats["value2"] = param_val2 + pair_stats["n1"] = len(metric_vals1) + pair_stats["n2"] = len(metric_vals2) TEST_ONLY_FOR_DIFFERENCE = True if TEST_ONLY_FOR_DIFFERENCE: if ascending: # We want to minimize the metric - alternative = 'less' if rank1 < rank2 else 'greater' + alternative = "less" if rank1 < rank2 else "greater" else: # We want to maximize the metric - alternative = 'greater' if rank1 < rank2 else 'less' + alternative = "greater" if rank1 < rank2 else "less" else: - alternative = 'two-sided' + alternative = "two-sided" ind_kw = dict( equal_var=False, alternative=alternative, ) - ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) + ttest_ind_result = scipy.stats.ttest_ind( + metric_vals1, metric_vals2, **ind_kw + ) if 0: from benchmarker.benchmarker import stats_dict + stats1 = stats_dict(metric_vals1) stats2 = stats_dict(metric_vals2) scipy.stats.ttest_ind_from_stats( - stats1['mean'], stats1['std'], stats1['nobs'], - stats2['mean'], stats2['std'], stats2['nobs'], - **ind_kw + stats1["mean"], + stats1["std"], + stats1["nobs"], + stats2["mean"], + stats2["std"], + stats2["nobs"], + **ind_kw, ) # metric_vals1, metric_vals2, equal_var=False) scipy.stats.ttest_ind_from_stats - pair_stats['ttest_ind'] = ttest_ind_result + pair_stats["ttest_ind"] = ttest_ind_result # Do relative checks, need to find comparable subgroups metric_group1 = value_to_metric_group[param_val1] @@ -588,18 +628,21 @@ class ResultAnalysis(ub.NiceRepr): # Does this need to have the values aligned? # I think that is the case giving my understanding of paired # t-tests, but the docs need a PR to make that more clear. 
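# Minimal illustrative sketch (not part of the patch): scipy.stats.ttest_rel
# pairs the i-th elements of its two inputs, so both arrays must refer to the
# same experimental condition at each index (the alignment concern raised in
# the comment above).
import numpy as np
from scipy.stats import ttest_rel
a = np.array([1.10, 1.32, 1.21, 1.40])  # e.g. metric for value1, per condition
b = np.array([1.02, 1.25, 1.13, 1.31])  # metric for value2, same conditions
stat, pvalue = ttest_rel(a, b)          # tests whether mean(a - b) is zero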
- ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) - pair_stats['n_common'] = len(common) - pair_stats['ttest_rel'] = ttest_rel_result + ttest_rel_result = scipy.stats.ttest_rel( + comparable_groups1, comparable_groups2 + ) + pair_stats["n_common"] = len(common) + pair_stats["ttest_rel"] = ttest_rel_result pair_stats_list.append(pair_stats) - stats_row['pairwise'] = pair_stats_list + stats_row["pairwise"] = pair_stats_list return stats_row def build(self): import itertools as it + if len(self.results) < 2: - raise Exception('need at least 2 results') + raise Exception("need at least 2 results") varied = self.varied.copy() if self.ignore_params: @@ -614,21 +657,26 @@ class ResultAnalysis(ub.NiceRepr): # settings, for each group setting do the k=1 analysis within that group varied_param_names = list(varied.keys()) num_varied_params = len(varied) - held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} + held_constant_orders = { + num_varied_params + i if i < 0 else i for i in self.abalation_orders + } held_constant_orders = [i for i in held_constant_orders if i > 0] held_constant_groups = [] for k in held_constant_orders: held_constant_groups.extend( - list(map(list, it.combinations(varied_param_names, k)))) + list(map(list, it.combinations(varied_param_names, k))) + ) if self.metrics is None: - avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) + avail_metrics = set.intersection( + *[set(r.metrics.keys()) for r in self.results] + ) metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) else: metrics_of_interest = self.metrics self.metrics_of_interest = metrics_of_interest - self._description['metrics_of_interest'] = metrics_of_interest - self._description['num_groups'] = len(held_constant_groups) + self._description["metrics_of_interest"] = metrics_of_interest + self._description["num_groups"] = len(held_constant_groups) # Analyze the impact of each parameter self.statistics = statistics = [] @@ -637,24 +685,29 @@ class ResultAnalysis(ub.NiceRepr): stats_row = self.test_group(param_group, metric_key) statistics.append(stats_row) - self.stats_table = pd.DataFrame([ - ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) - for d in self.statistics]) + self.stats_table = pd.DataFrame( + [ + ub.dict_diff(d, {"pairwise", "param_values", "moments"}) + for d in self.statistics + ] + ) if len(self.stats_table): - self.stats_table = self.stats_table.sort_values('anova_rank_p') + self.stats_table = self.stats_table.sort_values("anova_rank_p") - self._description['built'] = True + self._description["built"] = True def report(self): - stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) + stat_groups = ub.group_items(self.statistics, key=lambda x: x["param_name"]) stat_groups_items = list(stat_groups.items()) # Modify this order to change the grouping pattern - grid = ub.named_product({ - 'stat_group_item': stat_groups_items, - 'metrics': self.metrics_of_interest, - }) + grid = ub.named_product( + { + "stat_group_item": stat_groups_items, + "metrics": self.metrics_of_interest, + } + ) for grid_item in grid: self._report_one(grid_item) @@ -662,58 +715,76 @@ class ResultAnalysis(ub.NiceRepr): def _report_one(self, grid_item): p_threshold = self.p_threshold - metric_key = grid_item['metrics'] - stat_groups_item = grid_item['stat_group_item'] + metric_key = grid_item["metrics"] + stat_groups_item = grid_item["stat_group_item"] param_name, stat_group = 
stat_groups_item - stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] - title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) - print('\n\n') + stats_row = ub.group_items(stat_group, key=lambda x: x["metric"])[metric_key][0] + title = f"PARAMETER: {param_name} - METRIC: {metric_key}" + print("\n\n") print(title) - print('=' * len(title)) - print(stats_row['moments']) - anova_rank_p = stats_row['anova_rank_p'] - anova_mean_p = stats_row['anova_mean_p'] + print("=" * len(title)) + print(stats_row["moments"]) + anova_rank_p = stats_row["anova_rank_p"] + anova_mean_p = stats_row["anova_mean_p"] # Rougly speaking - print('') - print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', - 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', - 'green' if anova_mean_p < p_threshold else None)) - print('') - print('Pairwise T-Tests') - for pairstat in stats_row['pairwise']: + print("") + print(f"ANOVA: If p is low, the param {param_name!r} might have an effect") + print( + ub.color_text( + f" Rank-ANOVA: p={anova_rank_p:0.8f}", + "green" if anova_rank_p < p_threshold else None, + ) + ) + print( + ub.color_text( + f" Mean-ANOVA: p={anova_mean_p:0.8f}", + "green" if anova_mean_p < p_threshold else None, + ) + ) + print("") + print("Pairwise T-Tests") + for pairstat in stats_row["pairwise"]: # Is this backwards? - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] + value1 = pairstat["value1"] + value2 = pairstat["value2"] + winner = pairstat["winner"] if value2 == winner: value1, value2 = value2, value1 - print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') - if 'ttest_ind' in pairstat: - ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', - 'green' if ttest_ind_result.pvalue < p_threshold else None)) - if 'ttest_rel' in pairstat: - n_common = pairstat['n_common'] - ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', - 'green' if ttest_rel_result.pvalue < p_threshold else None)) + print( + f" If p is low, {param_name}={value1} may outperform {param_name}={value2}." 
+ ) + if "ttest_ind" in pairstat: + ttest_ind_result = pairstat["ttest_ind"] + print( + ub.color_text( + f" ttest_ind: p={ttest_ind_result.pvalue:0.8f}", + "green" if ttest_ind_result.pvalue < p_threshold else None, + ) + ) + if "ttest_rel" in pairstat: + n_common = pairstat["n_common"] + ttest_rel_result = pairstat["ttest_ind"] + print( + ub.color_text( + f" ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}", + "green" if ttest_rel_result.pvalue < p_threshold else None, + ) + ) def conclusions(self): conclusions = [] for stat in self.statistics: - param_name = stat['param_name'] - metric = stat['metric'] - for pairstat in stat['pairwise']: - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] + param_name = stat["param_name"] + metric = stat["metric"] + for pairstat in stat["pairwise"]: + value1 = pairstat["value1"] + value2 = pairstat["value2"] + winner = pairstat["winner"] if value2 == winner: value1, value2 = value2, value1 - pvalue = stat = pairstat['ttest_ind'].pvalue - txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') + pvalue = stat = pairstat["ttest_ind"].pvalue + txt = f"p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}." conclusions.append(txt) return conclusions @@ -750,22 +821,24 @@ class ResultAnalysis(ub.NiceRepr): >>> self.plot(xlabel, metric_key, group_labels) """ import seaborn as sns + sns.set() from matplotlib import pyplot as plt # NOQA + data = self.table data = data.sort_values(metric_key) for gname, labels in group_labels.items(): if len(labels): new_col = [] - for row in data[labels].to_dict('records'): + for row in data[labels].to_dict("records"): item = ub.repr2(row, compact=1, si=1) new_col.append(item) gkey = gname + "_key" data[gkey] = new_col plot_kws = { - 'x': xlabel, - 'y': metric_key, + "x": xlabel, + "y": metric_key, } for gname, labels in group_labels.items(): if labels: @@ -776,34 +849,34 @@ class ResultAnalysis(ub.NiceRepr): fig_params = plot_kws.pop("fig", []) facet_kws = { - 'sharex': True, - 'sharey': True, + "sharex": True, + "sharey": True, } # facet_kws['col'] = plot_kws.pop("col", None) # facet_kws['row'] = plot_kws.pop("row", None) # if not facet_kws['row']: # facet_kws['col_wrap'] = 5 - plot_kws['row'] = plot_kws.get("row", None) + plot_kws["row"] = plot_kws.get("row", None) # if not plot_kws['row']: # plot_kws['col_wrap'] = 5 if not fig_params: - groups = [('', data)] + groups = [("", data)] else: groups = data.groupby(fig_params) - if 'marker' not in plot_kws: - plot_kws['marker'] = "o" + if "marker" not in plot_kws: + plot_kws["marker"] = "o" # We will want to overwrite this with our own std estimate - plot_kws['ci'] = "sd" + plot_kws["ci"] = "sd" # err_style='band', # err_kws=None, # Use a consistent pallete across plots - unique_hues = data['hue_key'].unique() + unique_hues = data["hue_key"].unique() palette = ub.dzip(unique_hues, sns.color_palette(n_colors=len(unique_hues))) - plot_kws['palette'] = palette + plot_kws["palette"] = palette plots = [] base_fnum = 1 @@ -819,9 +892,10 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.relplot( data=group, # kind='line', - kind='scatter', + kind="scatter", facet_kws=facet_kws, - **plot_kws) + **plot_kws, + ) fig = facet.figure fig.suptitle(fig_key) @@ -829,10 +903,12 @@ class ResultAnalysis(ub.NiceRepr): # facet = sns.FacetGrid(group, **facet_kws) # facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, **plot_kws) # facet.add_legend() - plots.append({ - 
'fig': fig, - 'facet': facet, - }) + plots.append( + { + "fig": fig, + "facet": facet, + } + ) return plots @@ -866,6 +942,7 @@ class SkillTracker: def __init__(self, player_ids): import openskill + self.player_ids = player_ids self.ratings = {m: openskill.Rating() for m in player_ids} # self.observations = [] @@ -879,6 +956,7 @@ class SkillTracker: Dict[T, float]: mapping from player ids to win probabilites """ from openskill import predict_win + teams = [[p] for p in list(self.ratings.keys())] ratings = [[r] for r in self.ratings.values()] probs = predict_win(ratings) @@ -897,6 +975,7 @@ class SkillTracker: winners are at the front (0-th place) of the list. """ import openskill + # self.observations.append(ranking) ratings = self.ratings team_standings = [[r] for r in ub.take(ratings, ranking)] diff --git a/json_benchmarks/benchmarker/util_json.py b/json_benchmarks/benchmarker/util_json.py index dc3da85..7930b33 100644 --- a/json_benchmarks/benchmarker/util_json.py +++ b/json_benchmarks/benchmarker/util_json.py @@ -1,9 +1,10 @@ import copy +import json +import pathlib +from collections import OrderedDict + import numpy as np import ubelt as ub -import json -from collections import OrderedDict -import pathlib def ensure_json_serializable(dict_, normalize_containers=False, verbose=0): @@ -64,7 +65,7 @@ def ensure_json_serializable(dict_, normalize_containers=False, verbose=0): elif isinstance(value, pathlib.Path): new_value = str(value) walker[prefix] = new_value - elif hasattr(value, '__json__'): + elif hasattr(value, "__json__"): new_value = value.__json__() walker[prefix] = new_value elif normalize_containers: @@ -159,9 +160,9 @@ def find_json_unserializable(data, quickcheck=False): # Purposely make loc non-hashable so its not confused with # an address. All we can know in this case is that they key # is at this level, there is no concept of where. 
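# Illustrative note (not part of the patch): this branch reports dictionary
# *keys* that are not JSON serializable (for example a tuple used as a key).
# Because such a key has no addressable position of its own, the reported
# loc ends with a deliberately non-hashable ['.keys', key] marker instead of
# a normal path element.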
- yield {'loc': root + [['.keys', key]], 'data': key} + yield {"loc": root + [[".keys", key]], "data": key} elif not isinstance(value, serializable_types): - yield {'loc': prefix, 'data': value} + yield {"loc": prefix, "data": value} def indexable_allclose(dct1, dct2, return_info=False): @@ -189,19 +190,21 @@ def indexable_allclose(dct1, dct2, return_info=False): walker1 = ub.IndexableWalker(dct1) walker2 = ub.IndexableWalker(dct2) flat_items1 = [ - (path, value) for path, value in walker1 - if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + (path, value) + for path, value in walker1 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0 + ] flat_items2 = [ - (path, value) for path, value in walker2 - if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + (path, value) + for path, value in walker2 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0 + ] flat_items1 = sorted(flat_items1) flat_items2 = sorted(flat_items2) if len(flat_items1) != len(flat_items2): - info = { - 'faillist': ['length mismatch'] - } + info = {"faillist": ["length mismatch"]} final_flag = False else: passlist = [] @@ -212,9 +215,13 @@ def indexable_allclose(dct1, dct2, return_info=False): p2, v2 = t2 assert p1 == p2 - flag = (v1 == v2) + flag = v1 == v2 if not flag: - if isinstance(v1, float) and isinstance(v2, float) and np.isclose(v1, v2): + if ( + isinstance(v1, float) + and isinstance(v2, float) + and np.isclose(v1, v2) + ): flag = True if flag: passlist.append(p1) @@ -223,8 +230,8 @@ def indexable_allclose(dct1, dct2, return_info=False): final_flag = len(faillist) == 0 info = { - 'passlist': passlist, - 'faillist': faillist, + "passlist": passlist, + "faillist": faillist, } if return_info: diff --git a/json_benchmarks/benchmarker/visualize.py b/json_benchmarks/benchmarker/visualize.py index 41f4679..f3d683a 100644 --- a/json_benchmarks/benchmarker/visualize.py +++ b/json_benchmarks/benchmarker/visualize.py @@ -2,7 +2,12 @@ import pandas as pd import ubelt as ub -def benchmark_analysis(rows, xlabel, group_labels, basis, ): +def benchmark_analysis( + rows, + xlabel, + group_labels, + basis, +): # xlabel = "size" # Set these to empty lists if they are not used # group_labels = { @@ -18,6 +23,7 @@ def benchmark_analysis(rows, xlabel, group_labels, basis, ): # key = ub.repr2(params, compact=1, si=1) from process_tracker.result_analysis import SkillTracker + RECORD_ALL = 0 USE_OPENSKILL = True diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 1bf6e65..306bf67 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -2,10 +2,11 @@ Main definition of the benchmarks """ import json -import ubelt as ub + import scriptconfig as scfg -from json_benchmarks import benchmarker -from json_benchmarks import datagen +import ubelt as ub + +from json_benchmarks import benchmarker, datagen KNOWN_LIBRARIES = [ "ujson", @@ -20,39 +21,49 @@ class JSONBenchmarkConfig(scfg.Config): """ Benchmark JSON implementations """ - default = { - 'disable': scfg.Value([], choices=KNOWN_LIBRARIES, help=ub.paragraph( - ''' - Remove specified libraries from the benchmarks - ''' - )), - 'factor': scfg.Value(1.0, help=ub.paragraph( - ''' + default = { + "disable": scfg.Value( + [], + choices=KNOWN_LIBRARIES, + help=ub.paragraph( + """ + Remove specified libraries from the benchmarks + """ + ), + ), + "factor": scfg.Value( + 1.0, + help=ub.paragraph( + """ Specify as a fraction to speed up benchmarks for development / testing - ''')), - - 'cache_dir': 
scfg.Value(None, help=ub.paragraph( - ''' + """ + ), + ), + "cache_dir": scfg.Value( + None, + help=ub.paragraph( + """ Location for benchmark cache. Defaults to $XDG_CACHE/ujson/benchmark_results/ - ''')), + """ + ), + ), } def normalize(self): - dpath = self['cache_dir'] + dpath = self["cache_dir"] if dpath is None: - dpath = ub.Path.appdir('ujson/benchmark_results') + dpath = ub.Path.appdir("ujson/benchmark_results") dpath = ub.Path(dpath) - self['cache_dir'] = dpath + self["cache_dir"] = dpath def available_json_impls(): import importlib - known_modnames = [ - 'ujson', 'json', 'nujson', 'orjson', 'simplejson' - ] + + known_modnames = ["ujson", "json", "nujson", "orjson", "simplejson"] json_impls = {} for libname in known_modnames: try: @@ -61,8 +72,8 @@ def available_json_impls(): pass else: json_impls[libname] = { - 'module': module, - 'version': module.__version__, + "module": module, + "version": module.__version__, } return json_impls @@ -75,15 +86,15 @@ def benchmark_json(): # These are the parameters that we benchmark over common_basis = { "impl": list(json_impls.keys()), - "func": ['dumps', 'loads'], + "func": ["dumps", "loads"], } sized_basis = { "input": [ - 'Array with doubles', - 'Array with UTF-8 strings', + "Array with doubles", + "Array with UTF-8 strings", # 'Medium complex object', - 'Array with True values', - 'Array of Dict[str, int]', + "Array with True values", + "Array of Dict[str, int]", # 'Dict of List[Dict[str, int]]', # 'Complex object' ], @@ -91,10 +102,8 @@ def benchmark_json(): # 1024, 2048, 4096, 8192, 12288], } predefined_basis = { - "input": [ - 'Complex object' - ], - 'size': [None], + "input": ["Complex object"], + "size": [None], } basis = [ @@ -106,7 +115,7 @@ def benchmark_json(): # abstract away the details of timing a process over a grid of parameters, # serializing the results, and aggregating results from disparate runs. benchmark = benchmarker.Benchmarker( - name='bench_json', + name="bench_json", num=100, bestof=10, verbose=3, @@ -114,7 +123,7 @@ def benchmark_json(): ) def is_blocked(params): - if params['input'] == 'Complex object' and params['impl'] == 'orjson': + if params["input"] == "Complex object" and params["impl"] == "orjson": return True # For each variation of your experiment, create a row. @@ -124,12 +133,12 @@ def benchmark_json(): # Make any modifications you need to compute input kwargs for each # method here. 
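# Illustrative sketch (not part of the patch): in the dispatch just below,
# the 'loads' variant encodes its input once during setup, so only decoding
# runs inside the timed region. Conceptually:
import json
encoded = json.dumps([1.5, 2.5, 3.5])  # setup work, outside the timer
decoded = json.loads(encoded)          # the call whose time is measured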
impl_info = json_impls[params["impl"]] - params["impl_version"] = impl_info['version'] - module = impl_info['module'] - if params['func'] == 'dumps': + params["impl_version"] = impl_info["version"] + module = impl_info["module"] + if params["func"] == "dumps": method = module.dumps data = data_lut[params["input"]](params["size"]) - elif params['func'] == 'loads': + elif params["func"] == "loads": method = module.loads to_encode = data_lut[params["input"]](params["size"]) data = json.dumps(to_encode) @@ -142,17 +151,18 @@ def benchmark_json(): # Put the logic you want to time here method(data) - dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() + dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() result_fpath = benchmark.dump_in_dpath(dpath) return result_fpath def aggregate_results(result_fpaths): import json + results = [] for fpath in result_fpaths: data = json.loads(fpath.read_text()) - for row in data['rows']: + for row in data["rows"]: result = benchmarker.BenchmarkerResult.load(fpath) results.extend(result.to_result_list()) @@ -164,12 +174,13 @@ def aggregate_results(result_fpaths): analysis = benchmarker.result_analysis.ResultAnalysis( results, metrics=[metric_key], - params=['impl'], + params=["impl"], metric_objectives={ - 'min_time': 'min', - 'mean_time': 'min', - 'time': 'min', - }) + "min_time": "min", + "mean_time": "min", + "time": "min", + }, + ) analysis.analysis() table = analysis.table @@ -182,21 +193,23 @@ def aggregate_results(result_fpaths): otherwise the new column for that row is set to None. """ import pandas as pd + # Stats groupings stats_cols = [ - 'nobs_time', - 'std_time', - 'mean_time', - 'max_time', - 'min_time', + "nobs_time", + "std_time", + "mean_time", + "max_time", + "min_time", ] - mapper = {c: c.replace('_time', '') for c in stats_cols} + mapper = {c: c.replace("_time", "") for c in stats_cols} unmapper = ub.invert_dict(mapper) non_stats_cols = list(ub.oset(data.columns) - stats_cols) if group_keys is None: group_keys = non_stats_cols non_group_keys = list(ub.oset(non_stats_cols) - group_keys) from json_benchmarks.benchmarker.benchmarker import combine_stats_arrs + new_rows = [] for group_vals, group in list(data.groupby(group_keys)): # hack, is this a pandas bug in 1.4.1? 
Is it fixed @@ -218,16 +231,18 @@ def aggregate_results(result_fpaths): new_data = pd.DataFrame(new_rows) return new_data - single_size = table[(table['size'] == 256) | table['size'].isnull()] + single_size = table[(table["size"] == 256) | table["size"].isnull()] # single_size_combo = aggregate_time_stats(single_size, None) - single_size_combo = aggregate_time_stats(single_size, ['name']) + single_size_combo = aggregate_time_stats(single_size, ["name"]) - param_group = ['impl', 'impl_version'] - single_size_combo['calls/sec'] = 1 / single_size_combo['mean_time'] + param_group = ["impl", "impl_version"] + single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] _single_size_combo = single_size_combo.copy() - _single_size_combo['calls/sec'] = _single_size_combo['calls/sec'].apply(lambda x: '{:,.02f}'.format(x)) - piv = _single_size_combo.pivot(['input', 'func'], param_group, 'calls/sec') - print('Table for size=256') + _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( + lambda x: f"{x:,.02f}" + ) + piv = _single_size_combo.pivot(["input", "func"], param_group, "calls/sec") + print("Table for size=256") print(piv) analysis.abalate(param_group) @@ -242,29 +257,31 @@ def aggregate_results(result_fpaths): "size": [], } import kwplot + kwplot.autosns() plots = analysis.plot(xlabel, metric_key, group_labels) for plot in plots: - for ax in plot['facet'].axes.ravel(): - ax.set_xscale('log') - ax.set_yscale('log') + for ax in plot["facet"].axes.ravel(): + ax.set_xscale("log") + ax.set_yscale("log") kwplot.show_if_requested() def main(): from json_benchmarks import core + config = core.JSONBenchmarkConfig(cmdline=True) - dpath = config['cache_dir'] + dpath = config["cache_dir"] run = 1 if run: result_fpath = core.benchmark_json() - print('result_fpath = {!r}'.format(result_fpath)) + print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] agg = 1 if agg: - result_fpaths = list(dpath.glob('benchmarks*.json')) + result_fpaths = list(dpath.glob("benchmarks*.json")) core.aggregate_results(result_fpaths) # results_output_table(libraries) diff --git a/json_benchmarks/datagen.py b/json_benchmarks/datagen.py index afb2708..f7f1f71 100644 --- a/json_benchmarks/datagen.py +++ b/json_benchmarks/datagen.py @@ -1,5 +1,6 @@ import random import sys + import ubelt as ub @@ -22,21 +23,23 @@ def json_test_data_generators(): >>> print('test_object = {!r}'.format(test_object)) """ data_lut = {} + def _register_data(name): def _wrap(func): data_lut[name] = func + return _wrap # seed if desired # rng = random.Random(0) rng = random - @_register_data('Array with doubles') + @_register_data("Array with doubles") def array_with_doubles(size): test_object = [sys.maxsize * rng.random() for _ in range(size)] return test_object - @_register_data('Array with UTF-8 strings') + @_register_data("Array with UTF-8 strings") def array_with_utf8_strings(size): utf8_string = ( "نظام الحكم سلطاني وراثي " @@ -46,7 +49,7 @@ def json_test_data_generators(): test_object = [utf8_string for _ in range(size)] return test_object - @_register_data('Medium complex object') + @_register_data("Medium complex object") def medium_complex_object(size): user = { "userId": 3381293, @@ -63,20 +66,19 @@ def json_test_data_generators(): test_object = [[user, friends] for _ in range(size)] return test_object - @_register_data('Array with True values') + @_register_data("Array with True values") def true_values(size): test_object = [True for _ in range(size)] return test_object - @_register_data('Array of 
Dict[str, int]') + @_register_data("Array of Dict[str, int]") def array_of_dict_string_int(size): test_object = [ - {str(rng.random() * 20): int(rng.random() * 1000000)} - for _ in range(size) + {str(rng.random() * 20): int(rng.random() * 1000000)} for _ in range(size) ] return test_object - @_register_data('Dict of List[Dict[str, int]]') + @_register_data("Dict of List[Dict[str, int]]") def dict_of_list_dict_str_int(size): keys = set() while len(keys) < size: @@ -91,23 +93,25 @@ def json_test_data_generators(): } return test_object - @_register_data('Complex object') + @_register_data("Complex object") def complex_object(size): import json + # TODO: might be better to reigster this file with setup.py or # download it via some mechanism try: dpath = ub.Path(__file__).parent - fpath = dpath / 'sample.json' + fpath = dpath / "sample.json" if not fpath.exists(): raise Exception except Exception: import ujson - dpath = ub.Path(ujson.__file__).parent / 'tests' - fpath = dpath / 'sample.json' + + dpath = ub.Path(ujson.__file__).parent / "tests" + fpath = dpath / "sample.json" if not fpath.exists(): raise Exception - with open(fpath, 'r') as f: + with open(fpath) as f: test_object = json.load(f) if size is not None: test_object = [test_object] * size From 78cbf7ea71766bafc6dd49cb3c13eba278317fbf Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 28 May 2022 21:56:26 -0400 Subject: [PATCH 09/25] tweaks --- json_benchmarks/benchmarker/__init__.py | 84 +++---- json_benchmarks/benchmarker/benchmarker.py | 83 +------ .../benchmarker/result_analysis.py | 33 ++- json_benchmarks/benchmarker/util_stats.py | 235 ++++++++++++++++++ json_benchmarks/core.py | 157 ++++++------ 5 files changed, 367 insertions(+), 225 deletions(-) create mode 100644 json_benchmarks/benchmarker/util_stats.py diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index ee32ea0..5614b61 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,57 +9,35 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import ( - aggregate, - benchmarker, - process_context, - result_analysis, - util_json, - visualize, -) -from json_benchmarks.benchmarker.aggregate import demo, demo_data -from json_benchmarks.benchmarker.benchmarker import ( - Benchmarker, - BenchmarkerConfig, - BenchmarkerResult, - combine_stats, - stats_dict, -) -from json_benchmarks.benchmarker.process_context import ProcessContext -from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, - Result, - ResultAnalysis, - SkillTracker, -) -from json_benchmarks.benchmarker.util_json import ( - ensure_json_serializable, - find_json_unserializable, - indexable_allclose, -) -from json_benchmarks.benchmarker.visualize import benchmark_analysis +from json_benchmarks.benchmarker import aggregate +from json_benchmarks.benchmarker import benchmarker +from json_benchmarks.benchmarker import process_context +from json_benchmarks.benchmarker import result_analysis +from json_benchmarks.benchmarker import util_json +from json_benchmarks.benchmarker import util_stats +from json_benchmarks.benchmarker import visualize -__all__ = [ - "Benchmarker", - "BenchmarkerConfig", - "BenchmarkerResult", - "DEFAULT_METRIC_TO_OBJECTIVE", - "ProcessContext", - "Result", - "ResultAnalysis", - "SkillTracker", - "aggregate", - "benchmark_analysis", - "benchmarker", - "combine_stats", - "demo", - "demo_data", - 
"ensure_json_serializable", - "find_json_unserializable", - "indexable_allclose", - "process_context", - "result_analysis", - "stats_dict", - "util_json", - "visualize", -] +from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) +from json_benchmarks.benchmarker.benchmarker import (Benchmarker, + BenchmarkerConfig, + BenchmarkerResult,) +from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker.result_analysis import ( + DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) +from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from json_benchmarks.benchmarker.util_stats import (aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict,) +from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', + 'ResultAnalysis', 'SkillTracker', 'aggregate', 'aggregate_stats', + 'benchmark_analysis', 'benchmarker', 'combine_stats', + 'combine_stats_arrs', 'demo', 'demo_data', + 'ensure_json_serializable', 'find_json_unserializable', + 'indexable_allclose', 'process_context', 'result_analysis', + 'stats_dict', 'util_json', 'util_stats', 'visualize'] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 008ba82..24859ed 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -158,91 +158,12 @@ class Benchmarker: } rows.append(row) else: + from json_benchmarks.benchmarker import util_stats times = np.array(ti.robust_times()) - metrics = stats_dict(times, "_time") + metrics = util_stats.stats_dict(times, "_time") row = { "metrics": metrics, "params": params, "name": key, } rows.append(row) - - -def stats_dict(data, suffix=""): - stats = { - "nobs" + suffix: len(data), - "mean" + suffix: data.mean(), - "std" + suffix: data.std(), - "min" + suffix: data.min(), - "max" + suffix: data.max(), - } - return stats - - -def combine_stats(s1, s2): - """ - Helper for combining mean and standard deviation of multiple measurements - - Args: - s1 (dict): stats dict containing mean, std, and n - s2 (dict): stats dict containing mean, std, and n - - Example: - >>> basis = { - >>> 'nobs1': [1, 10, 100, 10000], - >>> 'nobs2': [1, 10, 100, 10000], - >>> } - >>> for params in ub.named_product(basis): - >>> data1 = np.random.rand(params['nobs1']) - >>> data2 = np.random.rand(params['nobs2']) - >>> data3 = np.hstack([data1, data2]) - >>> s1 = stats_dict(data1) - >>> s2 = stats_dict(data2) - >>> s3 = stats_dict(data3) - >>> # Check that our combo works - >>> combo_s3 = combine_stats(s1, s2) - >>> compare = pd.DataFrame({'raw': s3, 'combo': combo_s3}) - >>> print(compare) - >>> assert np.allclose(compare.raw, compare.combo) - - References: - https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations - https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups - """ - stats = [s1, s2] - data = { - "nobs": np.array([s["nobs"] for s in stats]), - "mean": np.array([s["mean"] for s in stats]), - "std": np.array([s["std"] for s in stats]), - "min": np.array([s["min"] for s in stats]), - "max": np.array([s["max"] for s in stats]), - } - combine_stats_arrs(data) - - -def combine_stats_arrs(data): - sizes = data["nobs"] - means = data["mean"] - stds = data["std"] - mins = 
data["min"] - maxs = data["max"] - varis = stds * stds - - combo_size = sizes.sum() - combo_mean = (sizes * means).sum() / combo_size - - mean_deltas = means - combo_mean - - sv = (sizes * varis).sum() - sm = (sizes * (mean_deltas * mean_deltas)).sum() - combo_vars = (sv + sm) / combo_size - combo_std = np.sqrt(combo_vars) - - combo_stats = { - "nobs": combo_size, - "mean": combo_mean, - "std": combo_std, - "min": mins.min(), - "max": maxs.max(), - } - return combo_stats diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 944d85e..81865e2 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -788,7 +788,7 @@ class ResultAnalysis(ub.NiceRepr): conclusions.append(txt) return conclusions - def plot(self, xlabel, metric_key, group_labels): + def plot(self, xlabel, metric_key, group_labels, **kwargs): """ Args: group_labels (dict): @@ -818,10 +818,10 @@ class ResultAnalysis(ub.NiceRepr): >>> 'hue': ['z'], >>> 'size': [], >>> } - >>> self.plot(xlabel, metric_key, group_labels) + >>> kwargs = {'xscale': 'log', 'yscale': 'log'} + >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ import seaborn as sns - sns.set() from matplotlib import pyplot as plt # NOQA @@ -903,12 +903,27 @@ class ResultAnalysis(ub.NiceRepr): # facet = sns.FacetGrid(group, **facet_kws) # facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, **plot_kws) # facet.add_legend() - plots.append( - { - "fig": fig, - "facet": facet, - } - ) + + plot = { + "fig": fig, + "facet": facet, + } + plots.append(plot) + + for plot in plots: + xscale = kwargs.get('xscale', None) + yscale = kwargs.get('yscale', None) + for ax in plot['facet'].axes.ravel(): + if xscale is not None: + try: + ax.set_xscale(xscale) + except ValueError: + pass + if yscale is not None: + try: + ax.set_yscale(yscale) + except ValueError: + pass return plots diff --git a/json_benchmarks/benchmarker/util_stats.py b/json_benchmarks/benchmarker/util_stats.py new file mode 100644 index 0000000..3d12965 --- /dev/null +++ b/json_benchmarks/benchmarker/util_stats.py @@ -0,0 +1,235 @@ +import ubelt as ub +import numpy as np + +def __tabulate_issue(): + # MWE for tabulate issue + # The decimals are not aligned when using "," in the floatfmt + import tabulate + data = [ + [13213.2, 3213254.23, 432432.231,], + [432432., 432.3, 3.2] + ] + print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt=',.02f')) + print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt='.02f')) + + +def __groupby_issue(): + # MWE of an issue with pandas groupby + import pandas as pd + data = pd.DataFrame([ + {'p1': 'a', 'p2': 1, 'p3': 0}, + {'p1': 'a', 'p2': 1, 'p3': 0}, + {'p1': 'a', 'p2': 2, 'p3': 0}, + {'p1': 'b', 'p2': 2, 'p3': 0}, + {'p1': 'b', 'p2': 1, 'p3': 0}, + {'p1': 'b', 'p2': 1, 'p3': 0}, + {'p1': 'b', 'p2': 1, 'p3': 0}, + ]) + + by = 'p1' + key = list(data.groupby(by))[0][0] + result = { + 'by': by, + 'key': key, + 'type(key)': type(key) + } + print('result = {}'.format(ub.repr2(result, nl=1))) + assert not ub.iterable(key), ( + '`by` is specified as a scalar, so getting `key` as a scalar makes sense') + + by = ['p1'] + key = list(data.groupby(by))[0][0] + result = { + 'by': by, + 'key': key, + 'type(key)': type(key) + } + print('result = {}'.format(ub.repr2(result, nl=1))) + assert not ub.iterable(key), ( + '`by` is specified as a list of scalars (with one element), but we ' + 'still get `key` as a scalar. 
This does not make sense') + + by = ['p1', 'p2'] + key = list(data.groupby(by))[0][0] + result = { + 'by': by, + 'key': key, + 'type(key)': type(key) + } + print('result = {}'.format(ub.repr2(result, nl=1))) + assert ub.iterable(key), ( + '`by` is specified as a list of scalars (with multiple elements), ' + 'and we still get `key` as a tuple of values. This makes sense') + + +def aggregate_stats(data, suffix='', group_keys=None): + """ + Given columns interpreted as containing stats, aggregate those stats + within each group. For each row, any non-group, non-stat column + with consistent values across that columns in the group is kept as-is, + otherwise the new column for that row is set to None. + + Args: + data (DataFrame): + a data frame with columns: 'mean', 'std', 'min', 'max', and 'nobs' + (possibly with a suffix) + + suffix (str): + if the nobs, std, mean, min, and max have a suffix, specify it + + group_keys (List[str]): + pass + + Returns: + DataFrame: + New dataframe where grouped rows have been aggregated into a single + row. + + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson')) + >>> from json_benchmarks.benchmarker.util_stats import * # NOQA + >>> import pandas as pd + >>> data = pd.DataFrame([ + >>> # + >>> {'mean': 8, 'std': 1, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'a', 'p2': 1}, + >>> {'mean': 6, 'std': 2, 'min': 0, 'max': 1, 'nobs': 3, 'p1': 'a', 'p2': 1}, + >>> {'mean': 7, 'std': 3, 'min': 0, 'max': 2, 'nobs': 5, 'p1': 'a', 'p2': 2}, + >>> {'mean': 5, 'std': 4, 'min': 0, 'max': 3, 'nobs': 7, 'p1': 'a', 'p2': 1}, + >>> # + >>> {'mean': 3, 'std': 1, 'min': 0, 'max': 20, 'nobs': 6, 'p1': 'b', 'p2': 1}, + >>> {'mean': 0, 'std': 2, 'min': 0, 'max': 20, 'nobs': 26, 'p1': 'b', 'p2': 2}, + >>> {'mean': 9, 'std': 3, 'min': 0, 'max': 20, 'nobs': 496, 'p1': 'b', 'p2': 1}, + >>> # + >>> {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'c', 'p2': 2}, + >>> {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 7, 'p1': 'c', 'p2': 2}, + >>> # + >>> {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'd', 'p2': 2}, + >>> # + >>> {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'e', 'p2': 1}, + >>> ]) + >>> print(data) + >>> new_data = aggregate_stats(data) + >>> print(new_data) + >>> new_data1 = aggregate_stats(data, group_keys=['p1']) + >>> print(new_data1) + >>> new_data2 = aggregate_stats(data, group_keys=['p2']) + >>> print(new_data2) + """ + import pandas as pd + + # Stats groupings + raw_stats_cols = ["nobs", "std", "mean", "max", "min"] + stats_cols = [c + suffix for c in raw_stats_cols] + mapper = dict(zip(stats_cols, raw_stats_cols)) + unmapper = dict(zip(raw_stats_cols, stats_cols)) + non_stats_cols = list(ub.oset(data.columns) - stats_cols) + if group_keys is None: + group_keys = non_stats_cols + non_group_keys = list(ub.oset(non_stats_cols) - group_keys) + + new_rows = [] + for group_vals, group in list(data.groupby(group_keys)): + # hack, is this a pandas bug in 1.4.1? Is it fixed? 
(Not in 1.4.2) + if isinstance(group_keys, list) and len(group_keys) == 1: + # For some reason, when we specify group keys as a list of one + # element, we get a squeezed value out + group_vals = (group_vals,) + stat_data = group[stats_cols].rename(mapper, axis=1) + new_stats = combine_stats_arrs(stat_data) + new_time_stats = ub.map_keys(unmapper, new_stats) + new_row = ub.dzip(group_keys, group_vals) + if non_group_keys: + for k in non_group_keys: + unique_vals = group[k].unique() + if len(unique_vals) == 1: + new_row[k] = unique_vals[0] + else: + new_row[k] = None + new_row.update(new_time_stats) + new_rows.append(new_row) + new_data = pd.DataFrame(new_rows) + return new_data + + +def stats_dict(data, suffix=""): + stats = { + "nobs" + suffix: len(data), + "mean" + suffix: data.mean(), + "std" + suffix: data.std(), + "min" + suffix: data.min(), + "max" + suffix: data.max(), + } + return stats + + +def combine_stats(s1, s2): + """ + Helper for combining mean and standard deviation of multiple measurements + + Args: + s1 (dict): stats dict containing mean, std, and n + s2 (dict): stats dict containing mean, std, and n + + Example: + >>> basis = { + >>> 'nobs1': [1, 10, 100, 10000], + >>> 'nobs2': [1, 10, 100, 10000], + >>> } + >>> for params in ub.named_product(basis): + >>> data1 = np.random.rand(params['nobs1']) + >>> data2 = np.random.rand(params['nobs2']) + >>> data3 = np.hstack([data1, data2]) + >>> s1 = stats_dict(data1) + >>> s2 = stats_dict(data2) + >>> s3 = stats_dict(data3) + >>> # Check that our combo works + >>> combo_s3 = combine_stats(s1, s2) + >>> compare = pd.DataFrame({'raw': s3, 'combo': combo_s3}) + >>> print(compare) + >>> assert np.allclose(compare.raw, compare.combo) + + References: + https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations + https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups + """ + stats = [s1, s2] + data = { + "nobs": np.array([s["nobs"] for s in stats]), + "mean": np.array([s["mean"] for s in stats]), + "std": np.array([s["std"] for s in stats]), + "min": np.array([s["min"] for s in stats]), + "max": np.array([s["max"] for s in stats]), + } + combine_stats_arrs(data) + + +def combine_stats_arrs(data): + sizes = data["nobs"] + means = data["mean"] + stds = data["std"] + mins = data["min"] + maxs = data["max"] + varis = stds * stds + + # TODO: ddof + # https://github.com/Erotemic/misc/blob/28cf797b9b0f8bd82e3ebee2f6d0a688ecee2838/learn/stats.py#L128 + + combo_size = sizes.sum() + combo_mean = (sizes * means).sum() / combo_size + + mean_deltas = means - combo_mean + + sv = (sizes * varis).sum() + sm = (sizes * (mean_deltas * mean_deltas)).sum() + combo_vars = (sv + sm) / combo_size + combo_std = np.sqrt(combo_vars) + + combo_stats = { + "nobs": combo_size, + "mean": combo_mean, + "std": combo_std, + "min": mins.min(), + "max": maxs.max(), + } + return combo_stats diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 306bf67..18617c7 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -7,6 +7,7 @@ import scriptconfig as scfg import ubelt as ub from json_benchmarks import benchmarker, datagen +from json_benchmarks.benchmarker import util_stats KNOWN_LIBRARIES = [ "ujson", @@ -23,31 +24,50 @@ class JSONBenchmarkConfig(scfg.Config): """ default = { + "mode": scfg.Value( + "all", + position=1, + choices=["all", "single", "run", "analyze"], + help=ub.paragraph( + """ + By default all benchmarks are run, saved, and aggregated + with any other existing 
benchmarks for analysis and + visualization. + + In "single" mode, other existing benchmarks are ignord. + + In "run" mode, the benchmarks are run, but no analysis is done. + + In "analyze" mode, no benchmarks are run, but any existing + benchmarks are loaded for analysis and visualization. + """) + ), + "disable": scfg.Value( [], choices=KNOWN_LIBRARIES, help=ub.paragraph( """ - Remove specified libraries from the benchmarks - """ + Remove specified libraries from the benchmarks + """ ), ), "factor": scfg.Value( 1.0, help=ub.paragraph( """ - Specify as a fraction to speed up benchmarks for development / - testing - """ + Specify as a fraction to speed up benchmarks for development / + testing + """ ), ), "cache_dir": scfg.Value( None, help=ub.paragraph( """ - Location for benchmark cache. - Defaults to $XDG_CACHE/ujson/benchmark_results/ - """ + Location for benchmark cache. + Defaults to $XDG_CACHE/ujson/benchmark_results/ + """ ), ), } @@ -62,8 +82,7 @@ class JSONBenchmarkConfig(scfg.Config): def available_json_impls(): import importlib - - known_modnames = ["ujson", "json", "nujson", "orjson", "simplejson"] + known_modnames = KNOWN_LIBRARIES json_impls = {} for libname in known_modnames: try: @@ -116,8 +135,8 @@ def benchmark_json(): # serializing the results, and aggregating results from disparate runs. benchmark = benchmarker.Benchmarker( name="bench_json", - num=100, - bestof=10, + num=1000, + bestof=100, verbose=3, basis=basis, ) @@ -156,7 +175,7 @@ def benchmark_json(): return result_fpath -def aggregate_results(result_fpaths): +def analyze_results(result_fpaths): import json results = [] @@ -185,65 +204,28 @@ def aggregate_results(result_fpaths): table = analysis.table - def aggregate_time_stats(data, group_keys=None): - """ - Given columns interpreted as containing stats, aggregate those stats - within each group. For each row, any non-group, non-stat column - with consistent values across that columns in the group is kept as-is, - otherwise the new column for that row is set to None. - """ - import pandas as pd - - # Stats groupings - stats_cols = [ - "nobs_time", - "std_time", - "mean_time", - "max_time", - "min_time", - ] - mapper = {c: c.replace("_time", "") for c in stats_cols} - unmapper = ub.invert_dict(mapper) - non_stats_cols = list(ub.oset(data.columns) - stats_cols) - if group_keys is None: - group_keys = non_stats_cols - non_group_keys = list(ub.oset(non_stats_cols) - group_keys) - from json_benchmarks.benchmarker.benchmarker import combine_stats_arrs - - new_rows = [] - for group_vals, group in list(data.groupby(group_keys)): - # hack, is this a pandas bug in 1.4.1? 
Is it fixed - if isinstance(group_keys, list) and not isinstance(group_vals, list): - group_vals = [group_vals] - stat_data = group[stats_cols].rename(mapper, axis=1) - new_stats = combine_stats_arrs(stat_data) - new_time_stats = ub.map_keys(unmapper, new_stats) - new_row = ub.dzip(group_keys, group_vals) - if non_group_keys: - for k in non_group_keys: - unique_vals = group[k].unique() - if len(unique_vals) == 1: - new_row[k] = unique_vals[0] - else: - new_row[k] = None - new_row.update(new_time_stats) - new_rows.append(new_row) - new_data = pd.DataFrame(new_rows) - return new_data - single_size = table[(table["size"] == 256) | table["size"].isnull()] - # single_size_combo = aggregate_time_stats(single_size, None) - single_size_combo = aggregate_time_stats(single_size, ["name"]) + # single_size_combo = aggregate_stats(single_size, None) + single_size_combo = util_stats.aggregate_stats(single_size, suffix='_time', group_keys=["name"]) param_group = ["impl", "impl_version"] single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] - _single_size_combo = single_size_combo.copy() - _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( - lambda x: f"{x:,.02f}" - ) - piv = _single_size_combo.pivot(["input", "func"], param_group, "calls/sec") + # _single_size_combo = single_size_combo.copy() + # _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( + # + # ) + time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") + + hz_piv = (1 / time_piv) + # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") print("Table for size=256") - print(piv) + # print(hzstr_piv.to_markdown()) + print(hz_piv.to_markdown(floatfmt=',.02f')) + print("") + print("Above metrics are in call/sec, larger is better.") + + speedup_piv = hz_piv / hz_piv['json'].values + print(speedup_piv.to_markdown(floatfmt=',.02g')) analysis.abalate(param_group) # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) @@ -253,35 +235,46 @@ def aggregate_results(result_fpaths): group_labels = { "fig": ["input"], "col": ["func"], + # "fig": [], + # "col": ["func" "input"], "hue": ["impl", "impl_version"], "size": [], } import kwplot - kwplot.autosns() - plots = analysis.plot(xlabel, metric_key, group_labels) - for plot in plots: - for ax in plot["facet"].axes.ravel(): - ax.set_xscale("log") - ax.set_yscale("log") + plots = analysis.plot( + xlabel, metric_key, group_labels, + xscale='log', yscale='log', + ) + plots kwplot.show_if_requested() -def main(): - from json_benchmarks import core - - config = core.JSONBenchmarkConfig(cmdline=True) +def main(cmdline=True, **kwargs): + """ + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson')) + >>> from json_benchmarks.core import * # NOQA + >>> import kwplot + >>> kwplot.autosns() + >>> cmdline = False + >>> kwargs = {} + >>> main(cmdline, **kwargs) + """ + config = JSONBenchmarkConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] - run = 1 + run = config['mode'] in {'all', 'single', 'run'} if run: - result_fpath = core.benchmark_json() + result_fpath = benchmark_json() print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - agg = 1 + agg = config['mode'] not in {'single'} if agg: result_fpaths = list(dpath.glob("benchmarks*.json")) - core.aggregate_results(result_fpaths) - # results_output_table(libraries) + analyze = config['mode'] in {'all', 'single', 'analyze'} + if analyze: + analyze_results(result_fpaths) From 
470f440f3f0fd269e339978caa56a21e382f8379 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 01:56:39 +0000 Subject: [PATCH 10/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/benchmarker/__init__.py | 90 ++++++++++++------- json_benchmarks/benchmarker/benchmarker.py | 1 + .../benchmarker/result_analysis.py | 7 +- json_benchmarks/benchmarker/util_stats.py | 88 +++++++++--------- json_benchmarks/core.py | 31 ++++--- 5 files changed, 128 insertions(+), 89 deletions(-) diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index 5614b61..a278879 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,35 +9,65 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import aggregate -from json_benchmarks.benchmarker import benchmarker -from json_benchmarks.benchmarker import process_context -from json_benchmarks.benchmarker import result_analysis -from json_benchmarks.benchmarker import util_json -from json_benchmarks.benchmarker import util_stats -from json_benchmarks.benchmarker import visualize - -from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) -from json_benchmarks.benchmarker.benchmarker import (Benchmarker, - BenchmarkerConfig, - BenchmarkerResult,) -from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker import ( + aggregate, + benchmarker, + process_context, + result_analysis, + util_json, + util_stats, + visualize, +) +from json_benchmarks.benchmarker.aggregate import demo, demo_data +from json_benchmarks.benchmarker.benchmarker import ( + Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, +) +from json_benchmarks.benchmarker.process_context import ProcessContext from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) -from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from json_benchmarks.benchmarker.util_stats import (aggregate_stats, - combine_stats, - combine_stats_arrs, - stats_dict,) -from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + DEFAULT_METRIC_TO_OBJECTIVE, + Result, + ResultAnalysis, + SkillTracker, +) +from json_benchmarks.benchmarker.util_json import ( + ensure_json_serializable, + find_json_unserializable, + indexable_allclose, +) +from json_benchmarks.benchmarker.util_stats import ( + aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict, +) +from json_benchmarks.benchmarker.visualize import benchmark_analysis -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', - 'ResultAnalysis', 'SkillTracker', 'aggregate', 'aggregate_stats', - 'benchmark_analysis', 'benchmarker', 'combine_stats', - 'combine_stats_arrs', 'demo', 'demo_data', - 'ensure_json_serializable', 'find_json_unserializable', - 'indexable_allclose', 'process_context', 'result_analysis', - 'stats_dict', 'util_json', 'util_stats', 'visualize'] +__all__ = [ + "Benchmarker", + "BenchmarkerConfig", + "BenchmarkerResult", + "DEFAULT_METRIC_TO_OBJECTIVE", + "ProcessContext", + "Result", + "ResultAnalysis", + "SkillTracker", + "aggregate", + "aggregate_stats", + 
"benchmark_analysis", + "benchmarker", + "combine_stats", + "combine_stats_arrs", + "demo", + "demo_data", + "ensure_json_serializable", + "find_json_unserializable", + "indexable_allclose", + "process_context", + "result_analysis", + "stats_dict", + "util_json", + "util_stats", + "visualize", +] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 24859ed..7a0d4fa 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -159,6 +159,7 @@ class Benchmarker: rows.append(row) else: from json_benchmarks.benchmarker import util_stats + times = np.array(ti.robust_times()) metrics = util_stats.stats_dict(times, "_time") row = { diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 81865e2..846a046 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -822,6 +822,7 @@ class ResultAnalysis(ub.NiceRepr): >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ import seaborn as sns + sns.set() from matplotlib import pyplot as plt # NOQA @@ -911,9 +912,9 @@ class ResultAnalysis(ub.NiceRepr): plots.append(plot) for plot in plots: - xscale = kwargs.get('xscale', None) - yscale = kwargs.get('yscale', None) - for ax in plot['facet'].axes.ravel(): + xscale = kwargs.get("xscale", None) + yscale = kwargs.get("yscale", None) + for ax in plot["facet"].axes.ravel(): if xscale is not None: try: ax.set_xscale(xscale) diff --git a/json_benchmarks/benchmarker/util_stats.py b/json_benchmarks/benchmarker/util_stats.py index 3d12965..2eaa32c 100644 --- a/json_benchmarks/benchmarker/util_stats.py +++ b/json_benchmarks/benchmarker/util_stats.py @@ -1,68 +1,68 @@ -import ubelt as ub import numpy as np +import ubelt as ub + def __tabulate_issue(): # MWE for tabulate issue # The decimals are not aligned when using "," in the floatfmt import tabulate + data = [ - [13213.2, 3213254.23, 432432.231,], - [432432., 432.3, 3.2] + [ + 13213.2, + 3213254.23, + 432432.231, + ], + [432432.0, 432.3, 3.2], ] - print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt=',.02f')) - print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt='.02f')) + print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=",.02f")) + print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=".02f")) def __groupby_issue(): # MWE of an issue with pandas groupby import pandas as pd - data = pd.DataFrame([ - {'p1': 'a', 'p2': 1, 'p3': 0}, - {'p1': 'a', 'p2': 1, 'p3': 0}, - {'p1': 'a', 'p2': 2, 'p3': 0}, - {'p1': 'b', 'p2': 2, 'p3': 0}, - {'p1': 'b', 'p2': 1, 'p3': 0}, - {'p1': 'b', 'p2': 1, 'p3': 0}, - {'p1': 'b', 'p2': 1, 'p3': 0}, - ]) - by = 'p1' + data = pd.DataFrame( + [ + {"p1": "a", "p2": 1, "p3": 0}, + {"p1": "a", "p2": 1, "p3": 0}, + {"p1": "a", "p2": 2, "p3": 0}, + {"p1": "b", "p2": 2, "p3": 0}, + {"p1": "b", "p2": 1, "p3": 0}, + {"p1": "b", "p2": 1, "p3": 0}, + {"p1": "b", "p2": 1, "p3": 0}, + ] + ) + + by = "p1" key = list(data.groupby(by))[0][0] - result = { - 'by': by, - 'key': key, - 'type(key)': type(key) - } - print('result = {}'.format(ub.repr2(result, nl=1))) + result = {"by": by, "key": key, "type(key)": type(key)} + print(f"result = {ub.repr2(result, nl=1)}") + assert not ub.iterable( + key + ), "`by` is specified as a scalar, so getting `key` as a scalar makes sense" + + by = ["p1"] + key = list(data.groupby(by))[0][0] + result = {"by": by, "key": key, "type(key)": type(key)} + print(f"result = 
{ub.repr2(result, nl=1)}") assert not ub.iterable(key), ( - '`by` is specified as a scalar, so getting `key` as a scalar makes sense') + "`by` is specified as a list of scalars (with one element), but we " + "still get `key` as a scalar. This does not make sense" + ) - by = ['p1'] + by = ["p1", "p2"] key = list(data.groupby(by))[0][0] - result = { - 'by': by, - 'key': key, - 'type(key)': type(key) - } - print('result = {}'.format(ub.repr2(result, nl=1))) - assert not ub.iterable(key), ( - '`by` is specified as a list of scalars (with one element), but we ' - 'still get `key` as a scalar. This does not make sense') - - by = ['p1', 'p2'] - key = list(data.groupby(by))[0][0] - result = { - 'by': by, - 'key': key, - 'type(key)': type(key) - } - print('result = {}'.format(ub.repr2(result, nl=1))) + result = {"by": by, "key": key, "type(key)": type(key)} + print(f"result = {ub.repr2(result, nl=1)}") assert ub.iterable(key), ( - '`by` is specified as a list of scalars (with multiple elements), ' - 'and we still get `key` as a tuple of values. This makes sense') + "`by` is specified as a list of scalars (with multiple elements), " + "and we still get `key` as a tuple of values. This makes sense" + ) -def aggregate_stats(data, suffix='', group_keys=None): +def aggregate_stats(data, suffix="", group_keys=None): """ Given columns interpreted as containing stats, aggregate those stats within each group. For each row, any non-group, non-stat column diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 18617c7..5fa57aa 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -40,9 +40,9 @@ class JSONBenchmarkConfig(scfg.Config): In "analyze" mode, no benchmarks are run, but any existing benchmarks are loaded for analysis and visualization. 
- """) + """ + ), ), - "disable": scfg.Value( [], choices=KNOWN_LIBRARIES, @@ -82,6 +82,7 @@ class JSONBenchmarkConfig(scfg.Config): def available_json_impls(): import importlib + known_modnames = KNOWN_LIBRARIES json_impls = {} for libname in known_modnames: @@ -206,7 +207,9 @@ def analyze_results(result_fpaths): single_size = table[(table["size"] == 256) | table["size"].isnull()] # single_size_combo = aggregate_stats(single_size, None) - single_size_combo = util_stats.aggregate_stats(single_size, suffix='_time', group_keys=["name"]) + single_size_combo = util_stats.aggregate_stats( + single_size, suffix="_time", group_keys=["name"] + ) param_group = ["impl", "impl_version"] single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] @@ -216,16 +219,16 @@ def analyze_results(result_fpaths): # ) time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") - hz_piv = (1 / time_piv) + hz_piv = 1 / time_piv # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") print("Table for size=256") # print(hzstr_piv.to_markdown()) - print(hz_piv.to_markdown(floatfmt=',.02f')) + print(hz_piv.to_markdown(floatfmt=",.02f")) print("") print("Above metrics are in call/sec, larger is better.") - speedup_piv = hz_piv / hz_piv['json'].values - print(speedup_piv.to_markdown(floatfmt=',.02g')) + speedup_piv = hz_piv / hz_piv["json"].values + print(speedup_piv.to_markdown(floatfmt=",.02g")) analysis.abalate(param_group) # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) @@ -241,10 +244,14 @@ def analyze_results(result_fpaths): "size": [], } import kwplot + kwplot.autosns() plots = analysis.plot( - xlabel, metric_key, group_labels, - xscale='log', yscale='log', + xlabel, + metric_key, + group_labels, + xscale="log", + yscale="log", ) plots kwplot.show_if_requested() @@ -265,16 +272,16 @@ def main(cmdline=True, **kwargs): config = JSONBenchmarkConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] - run = config['mode'] in {'all', 'single', 'run'} + run = config["mode"] in {"all", "single", "run"} if run: result_fpath = benchmark_json() print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - agg = config['mode'] not in {'single'} + agg = config["mode"] not in {"single"} if agg: result_fpaths = list(dpath.glob("benchmarks*.json")) - analyze = config['mode'] in {'all', 'single', 'analyze'} + analyze = config["mode"] in {"all", "single", "analyze"} if analyze: analyze_results(result_fpaths) From 3b29b746e39c71157f77a67c2b429c9b392e3efa Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 28 May 2022 22:12:02 -0400 Subject: [PATCH 11/25] wip --- json_benchmarks/benchmarker/result_analysis.py | 12 ++++++++++-- json_benchmarks/core.py | 4 +++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 81865e2..4bc52e1 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -821,12 +821,17 @@ class ResultAnalysis(ub.NiceRepr): >>> kwargs = {'xscale': 'log', 'yscale': 'log'} >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ + print('Init seaborn and pyplot') import seaborn as sns sns.set() from matplotlib import pyplot as plt # NOQA + print('Starting plot') + data = self.table data = data.sort_values(metric_key) + + print('Compute group labels') for gname, labels in group_labels.items(): if len(labels): new_col = [] @@ -880,6 +885,7 @@ class ResultAnalysis(ub.NiceRepr): 
plots = [] base_fnum = 1 + print('Start plots') for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): # TODO: seaborn doesn't give us any option to reuse an existing # figure or even specify what it's handle should be. A patch should @@ -891,8 +897,8 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.relplot( data=group, - # kind='line', - kind="scatter", + kind='line', + # kind="scatter", facet_kws=facet_kws, **plot_kws, ) @@ -910,6 +916,7 @@ class ResultAnalysis(ub.NiceRepr): } plots.append(plot) + print('Adjust plots') for plot in plots: xscale = kwargs.get('xscale', None) yscale = kwargs.get('yscale', None) @@ -924,6 +931,7 @@ class ResultAnalysis(ub.NiceRepr): ax.set_yscale(yscale) except ValueError: pass + print('Finish') return plots diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 18617c7..7ada55a 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -179,7 +179,7 @@ def analyze_results(result_fpaths): import json results = [] - for fpath in result_fpaths: + for fpath in ub.ProgIter(result_fpaths, desc='load results'): data = json.loads(fpath.read_text()) for row in data["rows"]: result = benchmarker.BenchmarkerResult.load(fpath) @@ -242,6 +242,8 @@ def analyze_results(result_fpaths): } import kwplot kwplot.autosns() + self = analysis + plots = analysis.plot( xlabel, metric_key, group_labels, xscale='log', yscale='log', From 283b5e5f9ba1804d8f2f6e37c2655d4f59af4553 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 02:12:17 +0000 Subject: [PATCH 12/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/benchmarker/result_analysis.py | 14 +++++++------- json_benchmarks/core.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index ade9f36..da9ce05 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -821,18 +821,18 @@ class ResultAnalysis(ub.NiceRepr): >>> kwargs = {'xscale': 'log', 'yscale': 'log'} >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ - print('Init seaborn and pyplot') + print("Init seaborn and pyplot") import seaborn as sns sns.set() from matplotlib import pyplot as plt # NOQA - print('Starting plot') + print("Starting plot") data = self.table data = data.sort_values(metric_key) - print('Compute group labels') + print("Compute group labels") for gname, labels in group_labels.items(): if len(labels): new_col = [] @@ -886,7 +886,7 @@ class ResultAnalysis(ub.NiceRepr): plots = [] base_fnum = 1 - print('Start plots') + print("Start plots") for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): # TODO: seaborn doesn't give us any option to reuse an existing # figure or even specify what it's handle should be. 
A patch should @@ -898,7 +898,7 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.relplot( data=group, - kind='line', + kind="line", # kind="scatter", facet_kws=facet_kws, **plot_kws, @@ -917,7 +917,7 @@ class ResultAnalysis(ub.NiceRepr): } plots.append(plot) - print('Adjust plots') + print("Adjust plots") for plot in plots: xscale = kwargs.get("xscale", None) yscale = kwargs.get("yscale", None) @@ -932,7 +932,7 @@ class ResultAnalysis(ub.NiceRepr): ax.set_yscale(yscale) except ValueError: pass - print('Finish') + print("Finish") return plots diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 509e0c2..69b38c0 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -180,7 +180,7 @@ def analyze_results(result_fpaths): import json results = [] - for fpath in ub.ProgIter(result_fpaths, desc='load results'): + for fpath in ub.ProgIter(result_fpaths, desc="load results"): data = json.loads(fpath.read_text()) for row in data["rows"]: result = benchmarker.BenchmarkerResult.load(fpath) From 03ae1b85459dd234dec3803debbd9aa6061fb89e Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 28 May 2022 23:04:04 -0400 Subject: [PATCH 13/25] use aggregate mean std to plot errors --- .../benchmarker/result_analysis.py | 95 +++++++++++++++---- json_benchmarks/core.py | 7 +- 2 files changed, 83 insertions(+), 19 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index da9ce05..9f1730a 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -788,7 +788,7 @@ class ResultAnalysis(ub.NiceRepr): conclusions.append(txt) return conclusions - def plot(self, xlabel, metric_key, group_labels, **kwargs): + def plot(self, xlabel, metric_key, group_labels, data=None, **kwargs): """ Args: group_labels (dict): @@ -829,7 +829,8 @@ class ResultAnalysis(ub.NiceRepr): print("Starting plot") - data = self.table + if data is None: + data = self.table data = data.sort_values(metric_key) print("Compute group labels") @@ -884,10 +885,15 @@ class ResultAnalysis(ub.NiceRepr): palette = ub.dzip(unique_hues, sns.color_palette(n_colors=len(unique_hues))) plot_kws["palette"] = palette + # kwplot.close_figures() + plots = [] base_fnum = 1 print("Start plots") - for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): + # hack + hack_groups = [(k, v) for k, v in groups if k != "input=Complex object"] + + for fnum, (fig_key, group) in enumerate(hack_groups, start=base_fnum): # TODO: seaborn doesn't give us any option to reuse an existing # figure or even specify what it's handle should be. 
A patch should # be submitted to add that feature, but in the meantime work around @@ -903,6 +909,50 @@ class ResultAnalysis(ub.NiceRepr): facet_kws=facet_kws, **plot_kws, ) + from json_benchmarks.benchmarker.util_stats import aggregate_stats + + facet_data_groups = dict(list(facet.data.groupby(facet._col_var))) + # facet_data_group_iter = iter(facet_data_groups.keys()) + + for ax in facet.axes.ravel(): + col_key = ax.get_title().split('=', 1)[-1].strip() + # col_key = next(facet_data_group_iter) + col_data = facet_data_groups[col_key] + col_data['mean_time'] + col_data['std_time'] + xlabel = plot_kws['x'] + ylabel = plot_kws['y'] + subgroups = col_data.groupby(plot_kws['hue']) + for subgroup_key, subgroup in subgroups: + # combine stds in multiple groups on the x and manually draw errors + suffix = '_' + ylabel.partition('_')[2] + if 'mean_' in ylabel: + std_label = ylabel.replace('mean_', 'std_') + combo_group = aggregate_stats(subgroup, suffix=suffix, group_keys=[plot_kws['x']]) + _xdata = combo_group[xlabel].values + _ydata_mean = combo_group[ylabel].values + _ydata_std = combo_group[std_label].values + std_label = ylabel.replace('mean_', 'std_') + y_data_min = _ydata_mean - _ydata_std + y_data_max = _ydata_mean + _ydata_std + spread_alpha = 0.3 + color = palette[subgroup_key] + ax.fill_between(_xdata, y_data_min, y_data_max, alpha=spread_alpha, color=color, zorder=1) + # zorder=0) + + xscale = kwargs.get("xscale", None) + yscale = kwargs.get("yscale", None) + for ax in facet.axes.ravel(): + if xscale is not None: + try: + ax.set_xscale(xscale) + except ValueError: + pass + if yscale is not None: + try: + ax.set_yscale(yscale) + except ValueError: + pass fig = facet.figure fig.suptitle(fig_key) @@ -917,21 +967,30 @@ class ResultAnalysis(ub.NiceRepr): } plots.append(plot) - print("Adjust plots") - for plot in plots: - xscale = kwargs.get("xscale", None) - yscale = kwargs.get("yscale", None) - for ax in plot["facet"].axes.ravel(): - if xscale is not None: - try: - ax.set_xscale(xscale) - except ValueError: - pass - if yscale is not None: - try: - ax.set_yscale(yscale) - except ValueError: - pass + # if fnum >= 1: + # break + + # print("Adjust plots") + # for plot in plots: + # xscale = kwargs.get("xscale", None) + # yscale = kwargs.get("yscale", None) + # facet = plot["facet"] + + # facet_data_groups = dict(list(facet.data.groupby(facet._col_var))) + # facet_data_group_iter = iter(facet_data_groups.keys()) + + # for ax in facet.axes.ravel(): + + # if xscale is not None: + # try: + # ax.set_xscale(xscale) + # except ValueError: + # pass + # if yscale is not None: + # try: + # ax.set_yscale(yscale) + # except ValueError: + # pass print("Finish") return plots diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 69b38c0..a9c6d2d 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -204,8 +204,11 @@ def analyze_results(result_fpaths): analysis.analysis() table = analysis.table + stats_table = util_stats.aggregate_stats( + table, suffix="_time", group_keys=["name"] + ) - single_size = table[(table["size"] == 256) | table["size"].isnull()] + single_size = stats_table[(stats_table["size"] == 256) | stats_table["size"].isnull()] # single_size_combo = aggregate_stats(single_size, None) single_size_combo = util_stats.aggregate_stats( single_size, suffix="_time", group_keys=["name"] @@ -248,12 +251,14 @@ def analyze_results(result_fpaths): kwplot.autosns() self = analysis + data = stats_table plots = analysis.plot( xlabel, metric_key, group_labels, xscale="log", 
yscale="log", + data=data, ) plots kwplot.show_if_requested() From eee2a5ff66e0d0950a7d176c1407b60364bea100 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 03:04:18 +0000 Subject: [PATCH 14/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../benchmarker/result_analysis.py | 33 ++++++++++++------- json_benchmarks/core.py | 8 ++--- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 9f1730a..2702ecd 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -915,29 +915,38 @@ class ResultAnalysis(ub.NiceRepr): # facet_data_group_iter = iter(facet_data_groups.keys()) for ax in facet.axes.ravel(): - col_key = ax.get_title().split('=', 1)[-1].strip() + col_key = ax.get_title().split("=", 1)[-1].strip() # col_key = next(facet_data_group_iter) col_data = facet_data_groups[col_key] - col_data['mean_time'] - col_data['std_time'] - xlabel = plot_kws['x'] - ylabel = plot_kws['y'] - subgroups = col_data.groupby(plot_kws['hue']) + col_data["mean_time"] + col_data["std_time"] + xlabel = plot_kws["x"] + ylabel = plot_kws["y"] + subgroups = col_data.groupby(plot_kws["hue"]) for subgroup_key, subgroup in subgroups: # combine stds in multiple groups on the x and manually draw errors - suffix = '_' + ylabel.partition('_')[2] - if 'mean_' in ylabel: - std_label = ylabel.replace('mean_', 'std_') - combo_group = aggregate_stats(subgroup, suffix=suffix, group_keys=[plot_kws['x']]) + suffix = "_" + ylabel.partition("_")[2] + if "mean_" in ylabel: + std_label = ylabel.replace("mean_", "std_") + combo_group = aggregate_stats( + subgroup, suffix=suffix, group_keys=[plot_kws["x"]] + ) _xdata = combo_group[xlabel].values _ydata_mean = combo_group[ylabel].values _ydata_std = combo_group[std_label].values - std_label = ylabel.replace('mean_', 'std_') + std_label = ylabel.replace("mean_", "std_") y_data_min = _ydata_mean - _ydata_std y_data_max = _ydata_mean + _ydata_std spread_alpha = 0.3 color = palette[subgroup_key] - ax.fill_between(_xdata, y_data_min, y_data_max, alpha=spread_alpha, color=color, zorder=1) + ax.fill_between( + _xdata, + y_data_min, + y_data_max, + alpha=spread_alpha, + color=color, + zorder=1, + ) # zorder=0) xscale = kwargs.get("xscale", None) diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index a9c6d2d..0fa7969 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -204,11 +204,11 @@ def analyze_results(result_fpaths): analysis.analysis() table = analysis.table - stats_table = util_stats.aggregate_stats( - table, suffix="_time", group_keys=["name"] - ) + stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) - single_size = stats_table[(stats_table["size"] == 256) | stats_table["size"].isnull()] + single_size = stats_table[ + (stats_table["size"] == 256) | stats_table["size"].isnull() + ] # single_size_combo = aggregate_stats(single_size, None) single_size_combo = util_stats.aggregate_stats( single_size, suffix="_time", group_keys=["name"] From bd592fdd3bb4c9cf1706d9ab58ed74ea2ff7563d Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 13:26:59 -0400 Subject: [PATCH 15/25] Refactor core into measures and analysis submodules --- json_benchmarks/analysis.py | 112 +++++++++ json_benchmarks/benchmarker/__init__.py | 90 
+++---- json_benchmarks/benchmarker/aggregate.py | 74 ------ json_benchmarks/benchmarker/benchmarker.py | 62 +++++ .../benchmarker/result_analysis.py | 8 +- json_benchmarks/core.py | 225 +----------------- json_benchmarks/libraries.py | 67 ++++++ json_benchmarks/measures.py | 126 ++++++++++ 8 files changed, 409 insertions(+), 355 deletions(-) create mode 100644 json_benchmarks/analysis.py delete mode 100644 json_benchmarks/benchmarker/aggregate.py create mode 100644 json_benchmarks/libraries.py create mode 100644 json_benchmarks/measures.py diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py new file mode 100644 index 0000000..acda85e --- /dev/null +++ b/json_benchmarks/analysis.py @@ -0,0 +1,112 @@ +""" +The analysis of the measurements +""" +import scriptconfig as scfg +import ubelt as ub + + +class AnalysisConfig(scfg.Config): + default = { + "cache_dir": scfg.Value( + None, + help=ub.paragraph( + """ + Location for benchmark cache. + Defaults to $XDG_CACHE/ujson/benchmark_results/ + """ + ), + ), + } + + def normalize(self): + dpath = self["cache_dir"] + if dpath is None: + dpath = ub.Path.appdir("ujson/benchmark_results") + dpath = ub.Path(dpath) + self["cache_dir"] = dpath + + +def analyze_results(result_fpaths): + from json_benchmarks.benchmarker import util_stats + from json_benchmarks import benchmarker + import json + + results = [] + for fpath in ub.ProgIter(result_fpaths, desc="load results"): + data = json.loads(fpath.read_text()) + for row in data["rows"]: + result = benchmarker.BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "mean_time" + + # results = benchmark.result.to_result_list() + + analysis = benchmarker.result_analysis.ResultAnalysis( + results, + metrics=[metric_key], + params=["impl"], + metric_objectives={ + "min_time": "min", + "mean_time": "min", + "time": "min", + }, + ) + analysis.analysis() + + table = analysis.table + stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) + + single_size = stats_table[ + (stats_table["size"] == 256) | stats_table["size"].isnull() + ] + # single_size_combo = aggregate_stats(single_size, None) + single_size_combo = util_stats.aggregate_stats( + single_size, suffix="_time", group_keys=["name"] + ) + + param_group = ["impl", "impl_version"] + single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] + # _single_size_combo = single_size_combo.copy() + time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") + + hz_piv = 1 / time_piv + # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") + print("Table for size=256") + # print(hzstr_piv.to_markdown()) + print(hz_piv.to_markdown(floatfmt=",.02f")) + print("") + print("Above metrics are in call/sec, larger is better.") + + speedup_piv = hz_piv / hz_piv["json"].values + print(speedup_piv.to_markdown(floatfmt=",.02g")) + + analysis.abalate(param_group) + # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "fig": ["input"], + "col": ["func"], + # "fig": [], + # "col": ["func" "input"], + "hue": ["impl", "impl_version"], + "size": [], + } + import kwplot + kwplot.autosns() + self = analysis # NOQA + + data = stats_table + plots = analysis.plot( + xlabel, + metric_key, + group_labels, + xscale="log", + yscale="log", + data=data, + ) + plots + kwplot.show_if_requested() diff --git 
a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index a278879..aa42063 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,65 +9,33 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import ( - aggregate, - benchmarker, - process_context, - result_analysis, - util_json, - util_stats, - visualize, -) -from json_benchmarks.benchmarker.aggregate import demo, demo_data -from json_benchmarks.benchmarker.benchmarker import ( - Benchmarker, - BenchmarkerConfig, - BenchmarkerResult, -) -from json_benchmarks.benchmarker.process_context import ProcessContext -from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, - Result, - ResultAnalysis, - SkillTracker, -) -from json_benchmarks.benchmarker.util_json import ( - ensure_json_serializable, - find_json_unserializable, - indexable_allclose, -) -from json_benchmarks.benchmarker.util_stats import ( - aggregate_stats, - combine_stats, - combine_stats_arrs, - stats_dict, -) -from json_benchmarks.benchmarker.visualize import benchmark_analysis +from json_benchmarks.benchmarker import benchmarker +from json_benchmarks.benchmarker import process_context +from json_benchmarks.benchmarker import result_analysis +from json_benchmarks.benchmarker import util_json +from json_benchmarks.benchmarker import util_stats +from json_benchmarks.benchmarker import visualize -__all__ = [ - "Benchmarker", - "BenchmarkerConfig", - "BenchmarkerResult", - "DEFAULT_METRIC_TO_OBJECTIVE", - "ProcessContext", - "Result", - "ResultAnalysis", - "SkillTracker", - "aggregate", - "aggregate_stats", - "benchmark_analysis", - "benchmarker", - "combine_stats", - "combine_stats_arrs", - "demo", - "demo_data", - "ensure_json_serializable", - "find_json_unserializable", - "indexable_allclose", - "process_context", - "result_analysis", - "stats_dict", - "util_json", - "util_stats", - "visualize", -] +from json_benchmarks.benchmarker.benchmarker import (Benchmarker, + BenchmarkerConfig, + BenchmarkerResult,) +from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker.result_analysis import ( + DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) +from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from json_benchmarks.benchmarker.util_stats import (aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict,) +from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', + 'ResultAnalysis', 'SkillTracker', 'aggregate_stats', + 'benchmark_analysis', 'benchmarker', 'combine_stats', + 'combine_stats_arrs', 'ensure_json_serializable', + 'find_json_unserializable', 'indexable_allclose', 'process_context', + 'result_analysis', 'stats_dict', 'util_json', 'util_stats', + 'visualize'] diff --git a/json_benchmarks/benchmarker/aggregate.py b/json_benchmarks/benchmarker/aggregate.py deleted file mode 100644 index bba5771..0000000 --- a/json_benchmarks/benchmarker/aggregate.py +++ /dev/null @@ -1,74 +0,0 @@ -import json - -import pandas as pd -import ubelt as ub - - -def demo_data(): - import numpy as np - - from json_benchmarks.benchmarker.benchmarker import Benchmarker - - impl_lut = { - "numpy": np.sum, - "builtin": sum, - 
} - - def data_lut(params): - item = 42 if params["dtype"] == "int" else 42.0 - data = [item] * params["size"] - return data - - basis = { - "impl": ["builtin", "numpy"], - "size": [10, 10000], - "dtype": ["int", "float"], - } - - dpath = ub.Path.appdir("benchmarker/agg_demo").delete().ensuredir() - - def run_one_benchmark(): - self = Benchmarker(name="agg_demo", num=10, bestof=3, basis=basis) - for params in self.iter_params(): - impl = impl_lut[params["impl"]] - data = data_lut(params) - for timer in self.measure(): - with timer: - impl(data) - fpath = self.dump_in_dpath(dpath) - return fpath - - # Run the benchmark multiple times - fpaths = [] - for _ in range(5): - fpath = run_one_benchmark() - fpaths.append(fpath) - - return fpaths - - -def demo(): - from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis - - fpaths = demo_data() - - results = [] - for fpath in fpaths: - data = json.loads(fpath.read_text()) - for row in data["rows"]: - result = BenchmarkerResult.load(fpath) - results.extend(result.to_result_list()) - - analysis = result_analysis.ResultAnalysis( - results, - metrics=["min", "mean"], - params=["impl"], - metric_objectives={ - "min": "min", - "mean": "min", - }, - ) - analysis.analysis() - # single_df = pd.DataFrame(data['rows']) - # context = data['context'] - # single_df diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 7a0d4fa..ac53372 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -168,3 +168,65 @@ class Benchmarker: "name": key, } rows.append(row) + + +def _test_demo(): + from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis + from json_benchmarks.benchmarker.benchmarker import Benchmarker + import numpy as np + + impl_lut = { + "numpy": np.sum, + "builtin": sum, + } + + def data_lut(params): + item = 42 if params["dtype"] == "int" else 42.0 + data = [item] * params["size"] + return data + + basis = { + "impl": ["builtin", "numpy"], + "size": [10, 10000], + "dtype": ["int", "float"], + } + + dpath = ub.Path.appdir("benchmarker/agg_demo").delete().ensuredir() + + def run_one_benchmark(): + self = Benchmarker(name="agg_demo", num=10, bestof=3, basis=basis) + for params in self.iter_params(): + impl = impl_lut[params["impl"]] + data = data_lut(params) + for timer in self.measure(): + with timer: + impl(data) + fpath = self.dump_in_dpath(dpath) + return fpath + + # Run the benchmark multiple times + fpaths = [] + for _ in range(5): + fpath = run_one_benchmark() + fpaths.append(fpath) + + results = [] + for fpath in fpaths: + data = json.loads(fpath.read_text()) + for row in data["rows"]: + result = BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + analysis = result_analysis.ResultAnalysis( + results, + metrics=["min", "mean"], + params=["impl"], + metric_objectives={ + "min": "min", + "mean": "min", + }, + ) + analysis.analysis() + # single_df = pd.DataFrame(data['rows']) + # context = data['context'] + # single_df diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 2702ecd..108f3de 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -935,8 +935,12 @@ class ResultAnalysis(ub.NiceRepr): _ydata_mean = combo_group[ylabel].values _ydata_std = combo_group[std_label].values std_label = ylabel.replace("mean_", "std_") - y_data_min = _ydata_mean - _ydata_std - y_data_max = 
_ydata_mean + _ydata_std + + # Plot bars 3 standard deviations from the mean to + # get a 99.7% interval + num_std = 3 + y_data_min = _ydata_mean - num_std * _ydata_std + y_data_max = _ydata_mean + num_std * _ydata_std spread_alpha = 0.3 color = palette[subgroup_key] ax.fill_between( diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 0fa7969..d6103be 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -1,24 +1,14 @@ """ Main definition of the benchmarks """ -import json - import scriptconfig as scfg import ubelt as ub -from json_benchmarks import benchmarker, datagen -from json_benchmarks.benchmarker import util_stats - -KNOWN_LIBRARIES = [ - "ujson", - "nujson", - "orjson", - "simplejson", - "json", -] +from json_benchmarks import measures +from json_benchmarks import analysis -class JSONBenchmarkConfig(scfg.Config): +class CoreConfig(scfg.Config): """ Benchmark JSON implementations """ @@ -43,24 +33,7 @@ class JSONBenchmarkConfig(scfg.Config): """ ), ), - "disable": scfg.Value( - [], - choices=KNOWN_LIBRARIES, - help=ub.paragraph( - """ - Remove specified libraries from the benchmarks - """ - ), - ), - "factor": scfg.Value( - 1.0, - help=ub.paragraph( - """ - Specify as a fraction to speed up benchmarks for development / - testing - """ - ), - ), + "cache_dir": scfg.Value( None, help=ub.paragraph( @@ -80,190 +53,6 @@ class JSONBenchmarkConfig(scfg.Config): self["cache_dir"] = dpath -def available_json_impls(): - import importlib - - known_modnames = KNOWN_LIBRARIES - json_impls = {} - for libname in known_modnames: - try: - module = importlib.import_module(libname) - except ImportError: - pass - else: - json_impls[libname] = { - "module": module, - "version": module.__version__, - } - return json_impls - - -def benchmark_json(): - json_impls = available_json_impls() - - data_lut = datagen.json_test_data_generators() - - # These are the parameters that we benchmark over - common_basis = { - "impl": list(json_impls.keys()), - "func": ["dumps", "loads"], - } - sized_basis = { - "input": [ - "Array with doubles", - "Array with UTF-8 strings", - # 'Medium complex object', - "Array with True values", - "Array of Dict[str, int]", - # 'Dict of List[Dict[str, int]]', - # 'Complex object' - ], - "size": [1, 2, 4, 8, 16, 32, 128, 256, 512], - # 1024, 2048, 4096, 8192, 12288], - } - predefined_basis = { - "input": ["Complex object"], - "size": [None], - } - - basis = [ - ub.dict_union(common_basis, predefined_basis), - ub.dict_union(common_basis, sized_basis), - ] - - # The Benchmarker class is a new experimental API around timerit to - # abstract away the details of timing a process over a grid of parameters, - # serializing the results, and aggregating results from disparate runs. - benchmark = benchmarker.Benchmarker( - name="bench_json", - num=1000, - bestof=100, - verbose=3, - basis=basis, - ) - - def is_blocked(params): - if params["input"] == "Complex object" and params["impl"] == "orjson": - return True - - # For each variation of your experiment, create a row. - for params in benchmark.iter_params(): - if is_blocked(params): - continue - # Make any modifications you need to compute input kwargs for each - # method here. 
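# ----------------------------------------------------------------------------
# NOTE (illustration only, not part of the patches): a standalone sketch of the
# shaded error band drawn in result_analysis above: plot the mean curve, then
# fill between mean +/- num_std * std (num_std=3 approximates a 99.7% interval
# under a normality assumption). Toy arrays only; the real code pulls these
# columns out of aggregate_stats.
import numpy as np
from matplotlib import pyplot as plt

sizes = np.array([1, 32, 256, 1024])
mean_time = np.array([2e-6, 4e-5, 3e-4, 1e-3])
std_time = np.array([2e-7, 5e-6, 4e-5, 2e-4])
num_std = 3

fig, ax = plt.subplots()
ax.plot(sizes, mean_time)
ax.fill_between(
    sizes,
    mean_time - num_std * std_time,
    mean_time + num_std * std_time,
    alpha=0.3,
    zorder=1,
)
ax.set_xscale("log")
ax.set_yscale("log")
# ----------------------------------------------------------------------------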
- impl_info = json_impls[params["impl"]] - params["impl_version"] = impl_info["version"] - module = impl_info["module"] - if params["func"] == "dumps": - method = module.dumps - data = data_lut[params["input"]](params["size"]) - elif params["func"] == "loads": - method = module.loads - to_encode = data_lut[params["input"]](params["size"]) - data = json.dumps(to_encode) - # Timerit will run some user-specified number of loops. - # and compute time stats with similar methodology to timeit - for timer in benchmark.measure(): - # Put any setup logic you dont want to time here. - # ... - with timer: - # Put the logic you want to time here - method(data) - - dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() - result_fpath = benchmark.dump_in_dpath(dpath) - return result_fpath - - -def analyze_results(result_fpaths): - import json - - results = [] - for fpath in ub.ProgIter(result_fpaths, desc="load results"): - data = json.loads(fpath.read_text()) - for row in data["rows"]: - result = benchmarker.BenchmarkerResult.load(fpath) - results.extend(result.to_result_list()) - - RECORD_ALL = 0 - metric_key = "time" if RECORD_ALL else "mean_time" - - # results = benchmark.result.to_result_list() - - analysis = benchmarker.result_analysis.ResultAnalysis( - results, - metrics=[metric_key], - params=["impl"], - metric_objectives={ - "min_time": "min", - "mean_time": "min", - "time": "min", - }, - ) - analysis.analysis() - - table = analysis.table - stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) - - single_size = stats_table[ - (stats_table["size"] == 256) | stats_table["size"].isnull() - ] - # single_size_combo = aggregate_stats(single_size, None) - single_size_combo = util_stats.aggregate_stats( - single_size, suffix="_time", group_keys=["name"] - ) - - param_group = ["impl", "impl_version"] - single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] - # _single_size_combo = single_size_combo.copy() - # _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( - # - # ) - time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") - - hz_piv = 1 / time_piv - # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") - print("Table for size=256") - # print(hzstr_piv.to_markdown()) - print(hz_piv.to_markdown(floatfmt=",.02f")) - print("") - print("Above metrics are in call/sec, larger is better.") - - speedup_piv = hz_piv / hz_piv["json"].values - print(speedup_piv.to_markdown(floatfmt=",.02g")) - - analysis.abalate(param_group) - # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) - - xlabel = "size" - # Set these to empty lists if they are not used - group_labels = { - "fig": ["input"], - "col": ["func"], - # "fig": [], - # "col": ["func" "input"], - "hue": ["impl", "impl_version"], - "size": [], - } - import kwplot - - kwplot.autosns() - self = analysis - - data = stats_table - plots = analysis.plot( - xlabel, - metric_key, - group_labels, - xscale="log", - yscale="log", - data=data, - ) - plots - kwplot.show_if_requested() - - def main(cmdline=True, **kwargs): """ Example: @@ -276,12 +65,12 @@ def main(cmdline=True, **kwargs): >>> kwargs = {} >>> main(cmdline, **kwargs) """ - config = JSONBenchmarkConfig(cmdline=cmdline, data=kwargs) + config = CoreConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] run = config["mode"] in {"all", "single", "run"} if run: - result_fpath = benchmark_json() + result_fpath = measures.benchmark_json() print(f"result_fpath = {result_fpath!r}") 
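# ----------------------------------------------------------------------------
# NOTE (illustration only, not part of the patches): the mode-derived flags in
# main() here reduce to the following truth table (run the benchmarks /
# aggregate cached result files / analyze and plot):
#
#     mode        run    aggregate   analyze
#     all         yes    yes         yes
#     single      yes    no          yes
#     run         yes    yes         no
#     analyze     no     yes         yes
#
# A mode can also be selected programmatically, assuming scriptconfig merges
# keyword arguments into the defaults (as the data=kwargs call suggests):
#
#     from json_benchmarks import core
#     core.main(cmdline=False, mode="analyze")
# ----------------------------------------------------------------------------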
result_fpaths = [result_fpath] @@ -291,4 +80,4 @@ def main(cmdline=True, **kwargs): analyze = config["mode"] in {"all", "single", "analyze"} if analyze: - analyze_results(result_fpaths) + analysis.analyze_results(result_fpaths) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py new file mode 100644 index 0000000..d7a8550 --- /dev/null +++ b/json_benchmarks/libraries.py @@ -0,0 +1,67 @@ +""" +Define the json libraries we are considering +""" + +KNOWN_LIBRARIES = [ + {'modname': "ujson", 'distname': 'ujson'}, + {'modname': "nujson", 'distname': 'nujson'}, + {'modname': "orjson", 'distname': 'orjson'}, + {'modname': "simplejson", 'distname': 'simplejson'}, + {'modname': "json", 'distname': ""}, + {'modname': "simdjson", 'distname': 'pysimdjson'}, +] + +KNOWN_MODNAMES = [info['modname'] for info in KNOWN_LIBRARIES] + + +# TODO: +# def distname_to_modnames(distname): +# # TODO: nice way to switch between a module's import name and it's distribution name +# # References: +# # https://stackoverflow.com/questions/49764802/get-module-name-programmatically-with-only-pypi-package-name/49764960#49764960 +# import distlib.database +# distlib.database.DistributionPath().get_distribution(distname) +# # import importlib.metadata +# # importlib.metadata.metadata(distname) +# # importlib.util.find_spec(modname) +# # import simdjson +# # import pkg_resources +# # pkg_resources.get_distribution('pysimdjson') + + +def available_json_impls(): + """ + Return a dictionary of information about each json implementation + + Example: + >>> from json_benchmarks.libraries import * # NOQA + >>> json_impls = available_json_impls() + >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) + """ + import importlib + known_libinfo = KNOWN_LIBRARIES + json_impls = {} + for libinfo in known_libinfo: + modname = libinfo['modname'] + distname = libinfo['distname'] + try: + module = importlib.import_module(modname) + except ImportError: + pass + else: + import pkg_resources + mod_version = getattr(module, '__version__', None) + if distname == '': + pkg_version = mod_version + else: + pkg_version = pkg_resources.get_distribution(distname).version + if mod_version is not None: + assert mod_version == pkg_version + version = pkg_version + json_impls[modname] = { + "module": module, + "modname": modname, + "distname": distname, + "version": version, + } + return json_impls diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py new file mode 100644 index 0000000..1878c62 --- /dev/null +++ b/json_benchmarks/measures.py @@ -0,0 +1,126 @@ +""" +The definitions of the measurements we want to take +""" +import scriptconfig as scfg +import ubelt as ub +import json +from json_benchmarks import libraries + + +class MeasurementConfig(scfg.Config): + default = { + "disable": scfg.Value( + [], + choices=libraries.KNOWN_MODNAMES, + help=ub.paragraph( + """ + Remove specified libraries from the benchmarks + """ + ), + ), + "factor": scfg.Value( + 1.0, + help=ub.paragraph( + """ + Specify as a fraction to speed up benchmarks for development / + testing + """ + ), + ), + "cache_dir": scfg.Value( + None, + help=ub.paragraph( + """ + Location for benchmark cache. 
+ Defaults to $XDG_CACHE/ujson/benchmark_results/ + """ + ), + ), + } + + def normalize(self): + dpath = self["cache_dir"] + if dpath is None: + dpath = ub.Path.appdir("ujson/benchmark_results") + dpath = ub.Path(dpath) + self["cache_dir"] = dpath + + +def benchmark_json(): + from json_benchmarks import benchmarker + from json_benchmarks import datagen + from json_benchmarks import libraries + + json_impls = libraries.available_json_impls() + data_lut = datagen.json_test_data_generators() + + # These are the parameters that we benchmark over + common_basis = { + "impl": list(json_impls.keys()), + "func": ["dumps", "loads"], + } + sized_basis = { + "input": [ + "Array with doubles", + "Array with UTF-8 strings", + # 'Medium complex object', + "Array with True values", + "Array of Dict[str, int]", + # 'Dict of List[Dict[str, int]]', + # 'Complex object' + ], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], + } + predefined_basis = { + "input": ["Complex object"], + "size": [None], + } + + basis = [ + ub.dict_union(common_basis, predefined_basis), + ub.dict_union(common_basis, sized_basis), + ] + + # The Benchmarker class is a new experimental API around timerit to + # abstract away the details of timing a process over a grid of parameters, + # serializing the results, and aggregating results from disparate runs. + benchmark = benchmarker.Benchmarker( + name="bench_json", + num=1000, + bestof=100, + verbose=3, + basis=basis, + ) + + def is_blocked(params): + if params["input"] == "Complex object" and params["impl"] == "orjson": + return True + + # For each variation of your experiment, create a row. + for params in benchmark.iter_params(): + if is_blocked(params): + continue + # Make any modifications you need to compute input kwargs for each + # method here. + impl_info = json_impls[params["impl"]] + params["impl_version"] = impl_info["version"] + module = impl_info["module"] + if params["func"] == "dumps": + method = module.dumps + data = data_lut[params["input"]](params["size"]) + elif params["func"] == "loads": + method = module.loads + to_encode = data_lut[params["input"]](params["size"]) + data = json.dumps(to_encode) + # Timerit will run some user-specified number of loops. + # and compute time stats with similar methodology to timeit + for timer in benchmark.measure(): + # Put any setup logic you dont want to time here. + # ... 
+ with timer: + # Put the logic you want to time here + method(data) + + dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() + result_fpath = benchmark.dump_in_dpath(dpath) + return result_fpath From 2b2aedb89f1750d4f3901a07dd212927a318b32d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 17:27:13 +0000 Subject: [PATCH 16/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/analysis.py | 6 +- json_benchmarks/benchmarker/__init__.py | 83 ++++++++++++++-------- json_benchmarks/benchmarker/benchmarker.py | 3 +- json_benchmarks/core.py | 4 +- json_benchmarks/libraries.py | 24 ++++--- json_benchmarks/measures.py | 8 +-- 6 files changed, 79 insertions(+), 49 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index acda85e..2c9e923 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -27,10 +27,11 @@ class AnalysisConfig(scfg.Config): def analyze_results(result_fpaths): - from json_benchmarks.benchmarker import util_stats - from json_benchmarks import benchmarker import json + from json_benchmarks import benchmarker + from json_benchmarks.benchmarker import util_stats + results = [] for fpath in ub.ProgIter(result_fpaths, desc="load results"): data = json.loads(fpath.read_text()) @@ -96,6 +97,7 @@ def analyze_results(result_fpaths): "size": [], } import kwplot + kwplot.autosns() self = analysis # NOQA diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index aa42063..27d133a 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,33 +9,60 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import benchmarker -from json_benchmarks.benchmarker import process_context -from json_benchmarks.benchmarker import result_analysis -from json_benchmarks.benchmarker import util_json -from json_benchmarks.benchmarker import util_stats -from json_benchmarks.benchmarker import visualize - -from json_benchmarks.benchmarker.benchmarker import (Benchmarker, - BenchmarkerConfig, - BenchmarkerResult,) -from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker import ( + benchmarker, + process_context, + result_analysis, + util_json, + util_stats, + visualize, +) +from json_benchmarks.benchmarker.benchmarker import ( + Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, +) +from json_benchmarks.benchmarker.process_context import ProcessContext from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) -from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from json_benchmarks.benchmarker.util_stats import (aggregate_stats, - combine_stats, - combine_stats_arrs, - stats_dict,) -from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + DEFAULT_METRIC_TO_OBJECTIVE, + Result, + ResultAnalysis, + SkillTracker, +) +from json_benchmarks.benchmarker.util_json import ( + ensure_json_serializable, + find_json_unserializable, + indexable_allclose, +) +from json_benchmarks.benchmarker.util_stats import ( + aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict, +) +from json_benchmarks.benchmarker.visualize import 
benchmark_analysis -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', - 'ResultAnalysis', 'SkillTracker', 'aggregate_stats', - 'benchmark_analysis', 'benchmarker', 'combine_stats', - 'combine_stats_arrs', 'ensure_json_serializable', - 'find_json_unserializable', 'indexable_allclose', 'process_context', - 'result_analysis', 'stats_dict', 'util_json', 'util_stats', - 'visualize'] +__all__ = [ + "Benchmarker", + "BenchmarkerConfig", + "BenchmarkerResult", + "DEFAULT_METRIC_TO_OBJECTIVE", + "ProcessContext", + "Result", + "ResultAnalysis", + "SkillTracker", + "aggregate_stats", + "benchmark_analysis", + "benchmarker", + "combine_stats", + "combine_stats_arrs", + "ensure_json_serializable", + "find_json_unserializable", + "indexable_allclose", + "process_context", + "result_analysis", + "stats_dict", + "util_json", + "util_stats", + "visualize", +] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index ac53372..21e6234 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -171,9 +171,10 @@ class Benchmarker: def _test_demo(): + import numpy as np + from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis from json_benchmarks.benchmarker.benchmarker import Benchmarker - import numpy as np impl_lut = { "numpy": np.sum, diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index d6103be..c43a474 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -4,8 +4,7 @@ Main definition of the benchmarks import scriptconfig as scfg import ubelt as ub -from json_benchmarks import measures -from json_benchmarks import analysis +from json_benchmarks import analysis, measures class CoreConfig(scfg.Config): @@ -33,7 +32,6 @@ class CoreConfig(scfg.Config): """ ), ), - "cache_dir": scfg.Value( None, help=ub.paragraph( diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index d7a8550..088368c 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -3,15 +3,15 @@ Define the json libraries we are considering """ KNOWN_LIBRARIES = [ - {'modname': "ujson", 'distname': 'ujson'}, - {'modname': "nujson", 'distname': 'nujson'}, - {'modname': "orjson", 'distname': 'orjson'}, - {'modname': "simplejson", 'distname': 'simplejson'}, - {'modname': "json", 'distname': ""}, - {'modname': "simdjson", 'distname': 'pysimdjson'}, + {"modname": "ujson", "distname": "ujson"}, + {"modname": "nujson", "distname": "nujson"}, + {"modname": "orjson", "distname": "orjson"}, + {"modname": "simplejson", "distname": "simplejson"}, + {"modname": "json", "distname": ""}, + {"modname": "simdjson", "distname": "pysimdjson"}, ] -KNOWN_MODNAMES = [info['modname'] for info in KNOWN_LIBRARIES] +KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES] # TODO: @@ -39,19 +39,21 @@ def available_json_impls(): >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) """ import importlib + known_libinfo = KNOWN_LIBRARIES json_impls = {} for libinfo in known_libinfo: - modname = libinfo['modname'] - distname = libinfo['distname'] + modname = libinfo["modname"] + distname = libinfo["distname"] try: module = importlib.import_module(modname) except ImportError: pass else: import pkg_resources - mod_version = getattr(module, '__version__', None) - if distname == '': + + mod_version = getattr(module, "__version__", None) + if distname == "": pkg_version = mod_version else: pkg_version 
= pkg_resources.get_distribution(distname).version diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index 1878c62..8e768ce 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -1,9 +1,11 @@ """ The definitions of the measurements we want to take """ +import json + import scriptconfig as scfg import ubelt as ub -import json + from json_benchmarks import libraries @@ -47,9 +49,7 @@ class MeasurementConfig(scfg.Config): def benchmark_json(): - from json_benchmarks import benchmarker - from json_benchmarks import datagen - from json_benchmarks import libraries + from json_benchmarks import benchmarker, datagen, libraries json_impls = libraries.available_json_impls() data_lut = datagen.json_test_data_generators() From b0bc25ab3c02d7f652d8e06abd253dee0ec1266f Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 18:56:24 -0400 Subject: [PATCH 17/25] Add simd libraries --- json_benchmarks/core.py | 6 +++--- json_benchmarks/libraries.py | 36 ++++++++++++++++++++++++++++++++---- json_benchmarks/measures.py | 21 ++++++++++++--------- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index c43a474..af11174 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -72,9 +72,9 @@ def main(cmdline=True, **kwargs): print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - agg = config["mode"] not in {"single"} - if agg: - result_fpaths = list(dpath.glob("benchmarks*.json")) + # agg = config["mode"] not in {"single"} + # if agg: + # result_fpaths = list(dpath.glob("benchmarks*.json")) analyze = config["mode"] in {"all", "single", "analyze"} if analyze: diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index 088368c..f828e3f 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -9,6 +9,8 @@ KNOWN_LIBRARIES = [ {"modname": "simplejson", "distname": "simplejson"}, {"modname": "json", "distname": ""}, {"modname": "simdjson", "distname": "pysimdjson"}, + {"modname": "cysimdjson", "distname": "cysimdjson"}, + {"modname": "libpy_simdjson", "distname": "libpy-simdjson"}, ] KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES] @@ -29,6 +31,29 @@ KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES] # # pkg_resources.get_distribution('pysimdjson') +class Compatability: + """ + Expose a common API for all tested implmentations + """ + + @staticmethod + def lut_dumps(module): + if module.__name__ == 'cysimdjson': + return None + elif module.__name__ == 'pysimdjson': + return None + else: + return getattr(module, 'dumps', None) + + @staticmethod + def lut_loads(module): + if module.__name__ == 'cysimdjson': + parser = module.JSONParser() + return parser.loads + else: + return getattr(module, 'loads', None) + + def available_json_impls(): """ Return a dictionary of information about each json implementation @@ -39,7 +64,7 @@ def available_json_impls(): >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) """ import importlib - + import pkg_resources known_libinfo = KNOWN_LIBRARIES json_impls = {} for libinfo in known_libinfo: @@ -50,8 +75,6 @@ def available_json_impls(): except ImportError: pass else: - import pkg_resources - mod_version = getattr(module, "__version__", None) if distname == "": pkg_version = mod_version @@ -60,10 +83,15 @@ def available_json_impls(): if mod_version is not None: assert mod_version == pkg_version version = pkg_version - json_impls[modname] = { + dumps = 
Compatability.lut_dumps(module) + loads = Compatability.lut_loads(module) + impl_info = { "module": module, "modname": modname, "distname": distname, "version": version, + "dumps": dumps, + "loads": loads, } + json_impls[modname] = impl_info return json_impls diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index 8e768ce..ca209b1 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -69,7 +69,7 @@ def benchmark_json(): # 'Dict of List[Dict[str, int]]', # 'Complex object' ], - "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192], } predefined_basis = { "input": ["Complex object"], @@ -93,8 +93,10 @@ def benchmark_json(): ) def is_blocked(params): - if params["input"] == "Complex object" and params["impl"] == "orjson": - return True + if params["input"] == "Complex object": + # Some libraries can't handle the complex object + if params["impl"] in {"orjson", "libpy_simdjson"}: + return True # For each variation of your experiment, create a row. for params in benchmark.iter_params(): @@ -104,14 +106,15 @@ def benchmark_json(): # method here. impl_info = json_impls[params["impl"]] params["impl_version"] = impl_info["version"] - module = impl_info["module"] + method = impl_info[params["func"]] + if method is None: + # Not all libraries implement all methods + continue + py_data = data_lut[params["input"]](params["size"]) if params["func"] == "dumps": - method = module.dumps - data = data_lut[params["input"]](params["size"]) + data = py_data elif params["func"] == "loads": - method = module.loads - to_encode = data_lut[params["input"]](params["size"]) - data = json.dumps(to_encode) + data = json.dumps(py_data) # Timerit will run some user-specified number of loops. 
# and compute time stats with similar methodology to timeit for timer in benchmark.measure(): From 80d096015e35a747e9756c42ae16fb324fbb543f Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:07:46 -0400 Subject: [PATCH 18/25] Fix cysimdjson --- json_benchmarks/analysis.py | 1 - json_benchmarks/core.py | 7 ++++--- json_benchmarks/measures.py | 19 +++++++++++-------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index 2c9e923..35cb499 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -28,7 +28,6 @@ class AnalysisConfig(scfg.Config): def analyze_results(result_fpaths): import json - from json_benchmarks import benchmarker from json_benchmarks.benchmarker import util_stats diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index af11174..b840c69 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -65,6 +65,7 @@ def main(cmdline=True, **kwargs): """ config = CoreConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] + print(f'dpath={dpath}') run = config["mode"] in {"all", "single", "run"} if run: @@ -72,9 +73,9 @@ def main(cmdline=True, **kwargs): print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - # agg = config["mode"] not in {"single"} - # if agg: - # result_fpaths = list(dpath.glob("benchmarks*.json")) + agg = config["mode"] not in {"single"} + if agg: + result_fpaths = list(dpath.glob("benchmarks*.json")) analyze = config["mode"] in {"all", "single", "analyze"} if analyze: diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index ca209b1..7913cc6 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -86,8 +86,8 @@ def benchmark_json(): # serializing the results, and aggregating results from disparate runs. benchmark = benchmarker.Benchmarker( name="bench_json", - num=1000, - bestof=100, + num=100, + bestof=10, verbose=3, basis=basis, ) @@ -117,12 +117,15 @@ def benchmark_json(): data = json.dumps(py_data) # Timerit will run some user-specified number of loops. # and compute time stats with similar methodology to timeit - for timer in benchmark.measure(): - # Put any setup logic you dont want to time here. - # ... - with timer: - # Put the logic you want to time here - method(data) + try: + for timer in benchmark.measure(): + # Put any setup logic you dont want to time here. + # ... + with timer: + # Put the logic you want to time here + method(data) + except Exception as ex: + print(f'Failed to time: ex={ex}. 
Skipping') dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() result_fpath = benchmark.dump_in_dpath(dpath) From 7dbb203450810217c6f8790c01fb54d73d5305b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 23:08:02 +0000 Subject: [PATCH 19/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/analysis.py | 1 + json_benchmarks/core.py | 2 +- json_benchmarks/libraries.py | 12 +++++++----- json_benchmarks/measures.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index 35cb499..2c9e923 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -28,6 +28,7 @@ class AnalysisConfig(scfg.Config): def analyze_results(result_fpaths): import json + from json_benchmarks import benchmarker from json_benchmarks.benchmarker import util_stats diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index b840c69..9b760c8 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -65,7 +65,7 @@ def main(cmdline=True, **kwargs): """ config = CoreConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] - print(f'dpath={dpath}') + print(f"dpath={dpath}") run = config["mode"] in {"all", "single", "run"} if run: diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index f828e3f..027f7e9 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -38,20 +38,20 @@ class Compatability: @staticmethod def lut_dumps(module): - if module.__name__ == 'cysimdjson': + if module.__name__ == "cysimdjson": return None - elif module.__name__ == 'pysimdjson': + elif module.__name__ == "pysimdjson": return None else: - return getattr(module, 'dumps', None) + return getattr(module, "dumps", None) @staticmethod def lut_loads(module): - if module.__name__ == 'cysimdjson': + if module.__name__ == "cysimdjson": parser = module.JSONParser() return parser.loads else: - return getattr(module, 'loads', None) + return getattr(module, "loads", None) def available_json_impls(): @@ -64,7 +64,9 @@ def available_json_impls(): >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) """ import importlib + import pkg_resources + known_libinfo = KNOWN_LIBRARIES json_impls = {} for libinfo in known_libinfo: diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index 7913cc6..c44a461 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -125,7 +125,7 @@ def benchmark_json(): # Put the logic you want to time here method(data) except Exception as ex: - print(f'Failed to time: ex={ex}. Skipping') + print(f"Failed to time: ex={ex}. 
Skipping") dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() result_fpath = benchmark.dump_in_dpath(dpath) From 9358a546e1f1668df88bb394e87ffcf6e1de36cc Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:08:37 -0400 Subject: [PATCH 20/25] name fix --- json_benchmarks/libraries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index f828e3f..bcf60d7 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -40,7 +40,7 @@ class Compatability: def lut_dumps(module): if module.__name__ == 'cysimdjson': return None - elif module.__name__ == 'pysimdjson': + elif module.__name__ == 'simdjson': return None else: return getattr(module, 'dumps', None) From 9196d05d0b508691972c069df04a8379a8addf1a Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:10:51 -0400 Subject: [PATCH 21/25] wip --- json_benchmarks/libraries.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index aff9625..ee4b74d 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -40,11 +40,7 @@ class Compatability: def lut_dumps(module): if module.__name__ == "cysimdjson": return None -<<<<<<< HEAD elif module.__name__ == 'simdjson': -======= - elif module.__name__ == "pysimdjson": ->>>>>>> 7dbb203450810217c6f8790c01fb54d73d5305b7 return None else: return getattr(module, "dumps", None) From 2f3070d74f9125dcbf1f1a1abac22809526b165f Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:11:03 -0400 Subject: [PATCH 22/25] wip --- json_benchmarks/libraries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index ee4b74d..cb0efb7 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -40,7 +40,7 @@ class Compatability: def lut_dumps(module): if module.__name__ == "cysimdjson": return None - elif module.__name__ == 'simdjson': + elif module.__name__ == "simdjson": return None else: return getattr(module, "dumps", None) From ac5b1437120e18cf9fbc96476447d2951a6f6726 Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 30 May 2022 21:29:51 -0400 Subject: [PATCH 23/25] stats for fix-encode-surrogates --- json_benchmarks/analysis.py | 6 +++--- json_benchmarks/libraries.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index 2c9e923..d700e87 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -47,7 +47,7 @@ def analyze_results(result_fpaths): analysis = benchmarker.result_analysis.ResultAnalysis( results, metrics=[metric_key], - params=["impl"], + params=["impl", "impl_version"], metric_objectives={ "min_time": "min", "mean_time": "min", @@ -57,14 +57,14 @@ def analyze_results(result_fpaths): analysis.analysis() table = analysis.table - stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) + stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name", "impl_version"]) single_size = stats_table[ (stats_table["size"] == 256) | stats_table["size"].isnull() ] # single_size_combo = aggregate_stats(single_size, None) single_size_combo = util_stats.aggregate_stats( - single_size, suffix="_time", group_keys=["name"] + single_size, suffix="_time", group_keys=["name", "impl_version"] ) param_group = ["impl", "impl_version"] diff --git 
a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py
index cb0efb7..4fe04a8 100644
--- a/json_benchmarks/libraries.py
+++ b/json_benchmarks/libraries.py
@@ -4,13 +4,13 @@ Define the json libraries we are considering
 
 KNOWN_LIBRARIES = [
     {"modname": "ujson", "distname": "ujson"},
-    {"modname": "nujson", "distname": "nujson"},
-    {"modname": "orjson", "distname": "orjson"},
-    {"modname": "simplejson", "distname": "simplejson"},
+    # {"modname": "nujson", "distname": "nujson"},
+    # {"modname": "orjson", "distname": "orjson"},
+    # {"modname": "simplejson", "distname": "simplejson"},
     {"modname": "json", "distname": ""},
-    {"modname": "simdjson", "distname": "pysimdjson"},
-    {"modname": "cysimdjson", "distname": "cysimdjson"},
-    {"modname": "libpy_simdjson", "distname": "libpy-simdjson"},
+    # {"modname": "simdjson", "distname": "pysimdjson"},
+    # {"modname": "cysimdjson", "distname": "cysimdjson"},
+    # {"modname": "libpy_simdjson", "distname": "libpy-simdjson"},
 ]
 
 KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES]

From fd951a31f177ce863c9ceff357213183f340ecb4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 31 May 2022 01:30:42 +0000
Subject: [PATCH 24/25] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 json_benchmarks/analysis.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py
index d700e87..b3f8778 100644
--- a/json_benchmarks/analysis.py
+++ b/json_benchmarks/analysis.py
@@ -57,7 +57,9 @@ def analyze_results(result_fpaths):
     analysis.analysis()
 
     table = analysis.table
-    stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name", "impl_version"])
+    stats_table = util_stats.aggregate_stats(
+        table, suffix="_time", group_keys=["name", "impl_version"]
+    )
     single_size = stats_table[
         (stats_table["size"] == 256) | stats_table["size"].isnull()
     ]

From a580404b46d387b5c86d8690d16b3f6f833455f8 Mon Sep 17 00:00:00 2001
From: joncrall
Date: Wed, 17 Jan 2024 12:12:43 -0500
Subject: [PATCH 25/25] Fixed bug

---
 json_benchmarks/benchmarker/result_analysis.py | 11 +++++++++--
 json_benchmarks/benchmarker/util_stats.py | 6 +++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py
index 108f3de..39bc262 100644
--- a/json_benchmarks/benchmarker/result_analysis.py
+++ b/json_benchmarks/benchmarker/result_analysis.py
@@ -911,13 +911,20 @@ class ResultAnalysis(ub.NiceRepr):
         )
         from json_benchmarks.benchmarker.util_stats import aggregate_stats
 
-        facet_data_groups = dict(list(facet.data.groupby(facet._col_var)))
+        # print(f'facet._col_var={facet._col_var}')
+        if facet._col_var is not None:
+            facet_data_groups = dict(list(facet.data.groupby(facet._col_var)))
+        else:
+            facet_data_groups = None
 
         # facet_data_group_iter = iter(facet_data_groups.keys())
         for ax in facet.axes.ravel():
             col_key = ax.get_title().split("=", 1)[-1].strip()
             # col_key = next(facet_data_group_iter)
-            col_data = facet_data_groups[col_key]
+            if facet_data_groups is not None:
+                col_data = facet_data_groups[col_key]
+            else:
+                col_data = facet.data
             col_data["mean_time"]
             col_data["std_time"]
             xlabel = plot_kws["x"]
diff --git a/json_benchmarks/benchmarker/util_stats.py b/json_benchmarks/benchmarker/util_stats.py
index 2eaa32c..38cf2e0 100644
--- a/json_benchmarks/benchmarker/util_stats.py
+++ b/json_benchmarks/benchmarker/util_stats.py
@@ -190,8 +190,8 @@ def combine_stats(s1, s2):
     >>> assert np.allclose(compare.raw, compare.combo)
 
     References:
-        https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations
-        https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
+        .. [SO7753002] https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations
+        .. [SO2971315] https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
     """
     stats = [s1, s2]
    data = {
@@ -201,7 +201,7 @@ def combine_stats(s1, s2):
         "min": np.array([s["min"] for s in stats]),
         "max": np.array([s["max"] for s in stats]),
     }
-    combine_stats_arrs(data)
+    return combine_stats_arrs(data)
 
 
 def combine_stats_arrs(data):
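The references cited above describe the pooled-statistics identity that combine_stats relies on: the combined mean is the count-weighted average of the group means, and the combined variance is the count-weighted average of each group's variance plus the squared offset of its mean from the combined mean. A minimal standalone sketch of that identity follows; it assumes population-style standard deviations (numpy's np.std default), and the helper name combine_two_groups is illustrative only, not part of this repository.

import numpy as np

def combine_two_groups(n1, mean1, std1, n2, mean2, std2):
    # Count-weighted mean of the two groups
    n = n1 + n2
    mean = (n1 * mean1 + n2 * mean2) / n
    # Each group contributes its own variance plus the squared offset
    # of its mean from the combined mean, weighted by its count.
    var = (
        n1 * (std1 ** 2 + (mean1 - mean) ** 2)
        + n2 * (std2 ** 2 + (mean2 - mean) ** 2)
    ) / n
    return n, mean, np.sqrt(var)

# Sanity check against a direct computation on the concatenated raw data
x1, x2 = np.random.rand(7), np.random.rand(11)
n, mean, std = combine_two_groups(len(x1), x1.mean(), x1.std(),
                                  len(x2), x2.mean(), x2.std())
raw = np.concatenate([x1, x2])
assert np.isclose(mean, raw.mean()) and np.isclose(std, raw.std())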