From 2c47332ee3a5a948bba4262b1d9ad6c67011748b Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 9 May 2022 13:54:30 -0400 Subject: [PATCH 01/25] working on benchmark framework with t-test analysis --- tests/benchmark3.py | 930 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 930 insertions(+) create mode 100644 tests/benchmark3.py diff --git a/tests/benchmark3.py b/tests/benchmark3.py new file mode 100644 index 0000000..adcc44e --- /dev/null +++ b/tests/benchmark3.py @@ -0,0 +1,930 @@ +""" +Roadmap: + + - [ ] +""" + +import random +import sys + +import timerit +import ubelt as ub + +import pandas as pd +import ujson +import json + +import kwarray +import warnings +import math +import scipy +import numpy as np +import itertools as it +import scipy.stats # NOQA + + +def data_lut(input, size): + if input == "Array with UTF-8 strings": + test_object = [] + for x in range(size): + test_object.append( + "نظام الحكم سلطاني وراثي " + "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" + " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " + ) + return test_object + elif input == "Array with doubles": + test_object = [] + for x in range(256): + test_object.append(sys.maxsize * random.random()) + else: + raise KeyError(input) + + +def get_instance_info(): + """ + Get information about the machine and version of the library we are running + the benchmarks on. + + Requirements: + cpuinfo + """ + import cpuinfo + import datetime + start_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() + cpu_brand = cpuinfo.get_cpu_info()['brand_raw'] + instance_info = { + 'cpu_brand': cpu_brand, + 'start_time': start_time, + } + return instance_info + + +def benchmark_json_dumps(): + + JSON_IMPLS = { + "ujson": ujson, # Our json + "json": json, # Python's json + } + + if True: + import nujson + + JSON_IMPLS["nujson"] = nujson + import orjson + + JSON_IMPLS["nujson"] = orjson + import simplejson + + JSON_IMPLS["simplejson"] = simplejson + + version_infos = {k: v.__version__ for k, v in JSON_IMPLS.items()} + + def method_lut(impl): + return JSON_IMPLS[impl].dumps + + # Change params here to modify number of trials + ti = timerit.Timerit(1000, bestof=10, verbose=1) + + # if True, record every trail run and show variance in seaborn + # if False, use the standard timerit min/mean measures + RECORD_ALL = 1 + + # These are the parameters that we benchmark over + basis = { + "input": [ + "Array with UTF-8 strings", + "Array with doubles", + ], + "size": [1, 32, 256, 1024, 2048], + "impl": list(JSON_IMPLS.keys()), + } + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "col": ["input"], + "hue": ["impl"], + "size": [], + } + grid_iter = list(ub.named_product(basis)) + + instance_info = get_instance_info() + + # For each variation of your experiment, create a row. + rows = [] + for params in grid_iter: + group_keys = {} + for gname, labels in group_labels.items(): + group_keys[gname + "_key"] = ub.repr2( + ub.dict_isect(params, labels), compact=1, si=1 + ) + key = ub.repr2(params, compact=1, si=1) + # Make any modifications you need to compute input kwargs for each + # method here. + impl = params["impl"] + impl_version = version_infos[impl] + params["impl_version"] = impl_version + method = method_lut(impl) + data = data_lut(params["input"], params["size"]) + # Timerit will run some user-specified number of loops. 
+ # and compute time stats with similar methodology to timeit + for timer in ti.reset(key): + # Put any setup logic you dont want to time here. + # ... + with timer: + # Put the logic you want to time here + method(data) + + if RECORD_ALL: + # Seaborn will show the variance if this is enabled, otherwise + # use the robust timerit mean / min times + # chunk_iter = ub.chunks(ti.times, ti.bestof) + # times = list(map(min, chunk_iter)) # TODO: timerit method for this + times = ti.robust_times() + for time in times: + row = { + "time": time, + "key": key, + "ti_bestof": ti.bestof, + **instance_info, + **group_keys, + **params, + } + rows.append(row) + else: + row = { + "mean": ti.mean(), + "std": ti.std(), + "min": ti.min(), + "key": key, + "ti_num": ti.num, + "ti_bestof": ti.bestof, + **instance_info, + **group_keys, + **params, + } + rows.append(row) + + bench_results_dpath = ub.Path(ujson.__file__).parent / 'benchmark_results' + bench_results_dpath.ensuredir() + timestamp = instance_info['start_time'].replace(':', '') + bench_results_fpath = bench_results_dpath / 'benchmarks_{}.json'.format(timestamp) + + with open(bench_results_fpath, 'w') as file: + json.dump(rows, file) + + benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + + +def benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL): + + USE_OPENSKILL = True + + time_key = "time" if RECORD_ALL else "min" + + # The rows define a long-form pandas data array. + # Data in long-form makes it very easy to use seaborn. + data = pd.DataFrame(rows) + data = data.sort_values(time_key) + + if RECORD_ALL: + # Show the min / mean if we record all + min_times = data.groupby("key").min().rename({"time": "min"}, axis=1) + mean_times = ( + data.groupby("key")[["time"]].mean().rename({"time": "mean"}, axis=1) + ) + stats_data = pd.concat([min_times, mean_times], axis=1) + stats_data = stats_data.sort_values("min") + else: + stats_data = data + + if USE_OPENSKILL: + # Track the "skill" of each method + # The idea is that each setting of parameters is a game, and each + # "impl" is a player. We rank the players by which is fastest, and + # update their ranking according to the Weng-Lin Bayes ranking model. + # This does not take the fact that some "games" (i.e. parameter + # settings) are more important than others, but it should be fairly + # robust on average. + skillboard = SkillTracker(basis["impl"]) + + other_keys = sorted( + set(stats_data.columns) + - {"key", "impl", "min", "mean", "hue_key", "size_key", "style_key"} + ) + for params, variants in stats_data.groupby(other_keys): + variants = variants.sort_values("mean") + ranking = variants["impl"].reset_index(drop=True) + + mean_speedup = variants["mean"].max() / variants["mean"] + stats_data.loc[mean_speedup.index, "mean_speedup"] = mean_speedup + min_speedup = variants["min"].max() / variants["min"] + stats_data.loc[min_speedup.index, "min_speedup"] = min_speedup + + if USE_OPENSKILL: + skillboard.observe(ranking) + + print("Statistics:") + print(stats_data) + + if USE_OPENSKILL: + win_probs = skillboard.predict_win() + win_probs = ub.sorted_vals(win_probs, reverse=True) + print( + "Aggregated Rankings = {}".format( + ub.repr2(win_probs, nl=1, precision=4, align=":") + ) + ) + + plot = True + if plot: + # import seaborn as sns + # kwplot autosns works well for IPython and script execution. + # not sure about notebooks. 
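+        # Descriptive note (added): the FacetGrid below draws one subplot
+        # column per "col" group (the input type) and one line per "hue"
+        # group (the JSON implementation), with the benchmarked "size" on
+        # the x-axis and the measured time on the y-axis.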
+ import seaborn as sns + + sns.set() + from matplotlib import pyplot as plt + + plotkw = {} + for gname, labels in group_labels.items(): + if labels: + plotkw[gname] = gname + "_key" + + # Your variables may change + # ax = plt.figure().gca() + col = plotkw.pop("col") + facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) + facet.map_dataframe(sns.lineplot, x=xlabel, y=time_key, marker="o", **plotkw) + facet.add_legend() + # sns.lineplot(data=data, ) + # ax.set_title('JSON Benchmarks') + # ax.set_xlabel('Size') + # ax.set_ylabel('Time') + # ax.set_xscale('log') + # ax.set_yscale('log') + + try: + __IPYTHON__ + except NameError: + plt.show() + + +class Result(ub.NiceRepr): + """ + Storage of names, parameters, and quality metrics for a single experiment. + + Attributes: + name (str | None): + Name of the experiment. Optional. This is unused in the analysis. + (i.e. names will never be used computationally. Use them for keys) + + params (Dict[str, object]): configuration of the experiment. + This is a dictionary mapping a parameter name to its value. + + metrics (Dict[str, float]): quantitative results of the experiment + This is a dictionary for each quality metric computed on this + result. + + meta (Dict | None): any other metadata about this result. + This is unused in the analysis. + + Example: + >>> self = Result.demo(rng=32) + >>> print('self = {}'.format(self)) + self = + """ + def __init__(self, name, params, metrics, meta=None): + self.name = name + self.params = params + self.metrics = metrics + self.meta = meta + + def to_dict(self): + row = ub.dict_union({'name': self.name}, self.metrics, self.params) + return row + + def __nice__(self): + row = self.to_dict() + text = ub.repr2(row, compact=True, precision=2, sort=0) + return text + + @classmethod + def demo(cls, rng=None): + import numpy as np + import string + rng = kwarray.ensure_rng(rng) + demo_param_space = { + 'param1': list(range(3)), + 'param2': np.linspace(0, 10, 10), + 'param3': list(string.ascii_lowercase[0:3]), + } + params = {k: rng.choice(b) for k, b in demo_param_space.items()} + metrics = { + 'f1': rng.rand(), + 'acc': rng.rand(), + } + name = ub.hash_data(params)[0:8] + self = cls(name, params, metrics) + return self + + +class ResultAnalysis(ub.NiceRepr): + """ + Groups and runs stats on results + + Runs statistical tests on sets of configuration-metrics pairs + + Attributes: + results (List[Result]): list of results + + ignore_metrics (Set[str]): metrics to ignore + + ignore_params (Set[str]): parameters to ignore + + metric_objectives (Dict[str, str]): + indicate if each metrix should be maximized "max" or minimized + "min" + + metrics (List[str]): + only consider these metrics + + abalation_orders (Set[int]): + The number of parameters to be held constant in each statistical + grouping. Defaults to 1, so it groups together results where 1 + variable is held constant. Including 2 will include pairwise + settings of parameters to be held constant. Using -1 or -2 means + all but 1 or 2 parameters will be held constant, repsectively. 
+ + default_objective (str): + assume max or min for unknown metrics + + Example: + >>> self = ResultAnalysis.demo() + >>> self.analysis() + + Example: + >>> # Given a list of experiments, configs, and results + >>> # Create a ResultAnalysis object + >>> results = ResultAnalysis([ + >>> Result('expt0', {'param1': 2, 'param3': 'b'}, {'f1': 0.75}), + >>> Result('expt1', {'param1': 0, 'param3': 'c'}, {'f1': 0.92}), + >>> Result('expt2', {'param1': 1, 'param3': 'b'}, {'f1': 0.77}), + >>> Result('expt3', {'param1': 1, 'param3': 'a'}, {'f1': 0.67}), + >>> Result('expt4', {'param1': 0, 'param3': 'c'}, {'f1': 0.98}), + >>> Result('expt5', {'param1': 2, 'param3': 'a'}, {'f1': 0.86}), + >>> Result('expt6', {'param1': 1, 'param3': 'c'}, {'f1': 0.77}), + >>> Result('expt7', {'param1': 1, 'param3': 'c'}, {'f1': 0.41}), + >>> Result('expt8', {'param1': 1, 'param3': 'a'}, {'f1': 0.64}), + >>> Result('expt9', {'param1': 0, 'param3': 'b'}, {'f1': 0.95}), + >>> ]) + >>> # Calling the analysis method prints something like the following + >>> results.analysis() + + PARAMETER 'param1' - f1 + ======================= + f1 mean std max min num best + param1 + 0 0.950 0.030000 0.98 0.92 3.0 0.98 + 2 0.805 0.077782 0.86 0.75 2.0 0.86 + 1 0.652 0.147377 0.77 0.41 5.0 0.77 + + ANOVA hypothesis (roughly): the param 'param1' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.0397 + Mean-ANOVA: p=0.0277 + + Pairwise T-Tests + Is param1=0 about as good as param1=2? + ttest_ind: p=0.2058 + Is param1=1 about as good as param1=2? + ttest_ind: p=0.1508 + + + PARAMETER 'param3' - f1 + ======================= + f1 mean std max min num best + param3 + c 0.770000 0.255734 0.98 0.41 4.0 0.98 + b 0.823333 0.110151 0.95 0.75 3.0 0.95 + a 0.723333 0.119304 0.86 0.64 3.0 0.86 + + ANOVA hypothesis (roughly): the param 'param3' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.5890 + Mean-ANOVA: p=0.8145 + + Pairwise T-Tests + Is param3=b about as good as param3=c? + ttest_ind: p=0.7266 + Is param3=a about as good as param3=b? + ttest_ind: p=0.3466 + ttest_rel: p=0.3466 + Is param3=a about as good as param3=c? 
+ ttest_ind: p=0.7626 + """ + + def __init__(self, results, metrics=None, ignore_params=None, + ignore_metrics=None, metric_objectives=None, + abalation_orders={1}, default_objective='max'): + self.results = results + if ignore_metrics is None: + ignore_metrics = set() + if ignore_params is None: + ignore_params = set() + self.ignore_params = ignore_params + self.ignore_metrics = ignore_metrics + + self.abalation_orders = abalation_orders + self.default_objective = default_objective + + # encode if we want to maximize or minimize a metric + default_metric_to_objective = { + 'ap': 'max', + 'acc': 'max', + 'f1': 'max', + # + 'loss': 'min', + 'brier': 'min', + } + if metric_objectives is None: + metric_objectives = {} + + self.metric_objectives = default_metric_to_objective.copy() + self.metric_objectives.update(metric_objectives) + + self.metrics = metrics + self.statistics = None + + self._description = {} + self._description['built'] = False + self._description['num_results'] = len(self.results) + + def __nice__(self): + # if len(self._description) == 0: + # return 'unbuilt' + # else: + return ub.repr2(self._description, si=1, sv=1) + + @classmethod + def demo(cls, num=10, rng=None): + rng = kwarray.ensure_rng(rng) + results = [Result.demo(rng=rng) for _ in range(num)] + self = cls(results, metrics={'f1', 'acc'}) + return self + + def run(self): + self.build() + self.report() + + def analysis(self): + # alias for run + return self.run() + self.build() + self.report() + + @ub.memoize_property + def table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + def metric_table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + @ub.memoize_property + def varied(self): + config_rows = [r.params for r in self.results] + sentinel = object() + # pd.DataFrame(config_rows).channels + varied = dict(ub.varied_values(config_rows, default=sentinel, min_variations=1)) + # remove nans + varied = { + k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} + for k, vs in varied.items()} + varied = {k: vs for k, vs in varied.items() if len(vs)} + return varied + + def abalation_groups(self, param): + """ + Example: + >>> self = ResultAnalysis.demo() + >>> param = 'param2' + >>> self.abalation_groups(param) + """ + table = self.table + config_rows = [r.params for r in self.results] + config_keys = list(map(set, config_rows)) + if self.ignore_params: + config_keys = [c - self.ignore_params for c in config_keys] + isect_params = set.intersection(*config_keys) + other_params = sorted(isect_params - {param}) + groups = [] + for key, group in table.groupby(other_params, dropna=False): + if len(group) > 1: + groups.append(group) + return groups + + def abalate_one(self, param): + """ + Example: + >>> self = ResultAnalysis.demo() + >>> param = 'param2' + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self.abalate_one(param) + """ + import itertools as it + if self.table is None: + self.table = self.build_table() + param_unique_vals = self.table[param].unique().tolist() + score_improvements = ub.ddict(list) + scored_obs = [] + skillboard = SkillTracker(param_unique_vals) + groups = self.abalation_groups(param) + + for group in groups: + for metric_key in self.metrics: + ascending = self._objective_is_ascending(metric_key) + + group = group.sort_values(metric_key, ascending=ascending) + subgroups = group.groupby(param) + if ascending: + best_idx = subgroups[metric_key].idxmax() + else: + best_idx = 
subgroups[metric_key].idxmin() + best_group = group.loc[best_idx] + best_group = best_group.sort_values(metric_key, ascending=ascending) + + for x1, x2 in it.product(best_group.index, best_group.index): + if x1 != x2: + r1 = best_group.loc[x1] + r2 = best_group.loc[x2] + k1 = r1[param] + k2 = r2[param] + diff = r1[metric_key] - r2[metric_key] + score_improvements[(k1, k2)].append(diff) + + # metric_vals = best_group[metric_key].values + # diffs = metric_vals[None, :] - metric_vals[:, None] + best_group.set_index(param) + # best_group[param] + # best_group[metric_key].diff() + scored_ranking = best_group[[param, metric_key]].reset_index(drop=True) + scored_obs.append(scored_ranking) + skillboard.observe(scored_ranking[param]) + + print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) + win_probs = skillboard.predict_win() + print(f'win_probs={win_probs}') + for key, improves in score_improvements.items(): + k1, k2 = key + improves = np.array(improves) + pos_delta = improves[improves > 0] + print(f'\nWhen {param}={k1} is better than {param}={k2}') + print(pd.DataFrame([pd.Series(pos_delta).describe().T])) + return scored_obs + # self.varied[param] + + def _objective_is_ascending(self, metric_key): + """ + Return True if we should minimize the objective (lower is better) + Return False if we should maximize the objective (higher is better) + """ + objective = self.metric_objectives.get(metric_key, None) + if objective is None: + warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + objective = self.default_objective + ascending = (objective == 'min') + return ascending + + def test_group(self, param_group, metric_key): + """ + Get stats for a particular metric / constant group + + Args: + param_group (List[str]): group of parameters to hold constant. + metric_key (str): The metric to test. + + Returns: + dict + # TODO : document these stats clearly and accurately + + Example: + >>> self = ResultAnalysis.demo(num=30) + >>> print(self.table) + >>> param_group = ['param2'] + >>> metric_key = 'f1' + >>> stats_row = self.test_group(param_group, metric_key) + >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) + >>> # --- + >>> self.build() + >>> self.report() + """ + param_group_name = ','.join(param_group) + stats_row = { + 'param_name': param_group_name, + 'metric': metric_key, + } + # param_values = varied[param_name] + # stats_row['param_values'] = param_values + ascending = self._objective_is_ascending(metric_key) + + # Find all items with this particular param value + value_to_metric_group = {} + value_to_metric_stats = {} + value_to_metric = {} + + varied_cols = sorted(self.varied.keys()) + + # Not sure if this is the right name, these are the other param keys + # that we are not directly investigating, but might have an impact. 
+ # We use these to select comparable rows for pairwise t-tests + nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) + + for param_value, group in self.table.groupby(param_group): + metric_group = group[['name', metric_key] + varied_cols] + metric_vals = metric_group[metric_key] + metric_vals = metric_vals.dropna() + if len(metric_vals) > 0: + metric_stats = metric_vals.describe() + value_to_metric_stats[param_value] = metric_stats + value_to_metric_group[param_value] = metric_group + value_to_metric[param_value] = metric_vals.values + + moments = pd.DataFrame(value_to_metric_stats).T + moments = moments.sort_values('mean', ascending=ascending) + moments.index.name = param_group_name + moments.columns.name = metric_key + ranking = moments['mean'].index.to_list() + param_to_rank = ub.invert_dict(dict(enumerate(ranking))) + + # Determine a set of value pairs to do pairwise comparisons on + value_pairs = ub.oset() + value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) + value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) + + # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance + # If the researcher can make the assumptions of an identically + # shaped and scaled distribution for all groups, except for any + # difference in medians, then the null hypothesis is that the + # medians of all groups are equal, and the alternative + # hypothesis is that at least one population median of one + # group is different from the population median of at least one + # other group. + try: + anova_krus_result = scipy.stats.kruskal(*value_to_metric.values()) + except ValueError: + anova_krus_result = scipy.stats.stats.KruskalResult(np.nan, np.nan) + + # https://en.wikipedia.org/wiki/One-way_analysis_of_variance + # The One-Way ANOVA tests the null hypothesis, which states + # that samples in all groups are drawn from populations with + # the same mean values + if len(value_to_metric) > 1: + anova_1way_result = scipy.stats.f_oneway(*value_to_metric.values()) + else: + anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) + + stats_row['anova_rank_H'] = anova_krus_result.statistic + stats_row['anova_rank_p'] = anova_krus_result.pvalue + stats_row['anova_mean_F'] = anova_1way_result.statistic + stats_row['anova_mean_p'] = anova_1way_result.pvalue + stats_row['moments'] = moments + + pairwise_statistics = [] + for pair in value_pairs: + pair_statistics = {} + # try: + # param_val1, param_val2 = sorted(pair) + # except Exception: + # param_val1, param_val2 = (pair) + param_val1, param_val2 = pair + + metric_vals1 = value_to_metric[param_val1] + metric_vals2 = value_to_metric[param_val2] + + rank1 = param_to_rank[param_val1] + rank2 = param_to_rank[param_val2] + pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 + pair_statistics['value1'] = param_val1 + pair_statistics['value2'] = param_val2 + pair_statistics['n1'] = len(metric_vals1) + pair_statistics['n2'] = len(metric_vals2) + ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, equal_var=False) + pair_statistics['ttest_ind'] = ttest_ind_result + + # Do relative checks, need to find comparable subgroups + metric_group1 = value_to_metric_group[param_val1] + metric_group2 = value_to_metric_group[param_val2] + nuisance_vals1 = metric_group1[nuisance_cols] + nuisance_vals2 = metric_group2[nuisance_cols] + nk_to_group1 = dict(list(nuisance_vals1.groupby(nuisance_cols))) + nk_to_group2 = 
dict(list(nuisance_vals2.groupby(nuisance_cols))) + common = set(nk_to_group1) & set(nk_to_group2) + comparable_indexes1 = [] + comparable_indexes2 = [] + if common: + for nk in common: + group1 = nk_to_group1[nk] + group2 = nk_to_group2[nk] + for i, j in it.product(group1.index, group2.index): + comparable_indexes1.append(i) + comparable_indexes2.append(j) + + comparable_groups1 = metric_group1.loc[comparable_indexes1, metric_key] + comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] + + # Does this need to have the values aligned? + ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) + pair_statistics['ttest_rel'] = ttest_rel_result + pairwise_statistics.append(pair_statistics) + + stats_row['pairwise'] = pairwise_statistics + return stats_row + + def build(self): + import itertools as it + if len(self.results) < 2: + raise Exception('need at least 2 results') + + varied = self.varied.copy() + if self.ignore_params: + for k in self.ignore_params: + varied.pop(k, None) + + # Experimental: + # Find Auto-abalation groups + # TODO: when the group size is -1, instead of showing all of the group + # settings, for each group setting do the k=1 analysis within that group + varied_param_names = list(varied.keys()) + num_varied_params = len(varied) + held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} + held_constant_orders = [i for i in held_constant_orders if i > 0] + held_constant_groups = [] + for k in held_constant_orders: + held_constant_groups.extend( + list(map(list, it.combinations(varied_param_names, k)))) + + if self.metrics is None: + avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) + metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) + else: + metrics_of_interest = self.metrics + self.metrics_of_interest = metrics_of_interest + self._description['metrics_of_interest'] = metrics_of_interest + self._description['num_groups'] = len(held_constant_groups) + + # Analyze the impact of each parameter + self.statistics = statistics = [] + for param_group in held_constant_groups: + for metric_key in metrics_of_interest: + stats_row = self.test_group(param_group, metric_key) + statistics.append(stats_row) + + self.stats_table = pd.DataFrame([ + ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) + for d in self.statistics]) + + if len(self.stats_table): + self.stats_table = self.stats_table.sort_values('anova_rank_p') + + self._description['built'] = True + + def report(self): + p_threshold = 0.05 + stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) + stat_groups_items = list(stat_groups.items()) + + # Modify this order to change the grouping pattern + grid = ub.named_product({ + 'stat_group_item': stat_groups_items, + 'metrics': self.metrics_of_interest, + }) + for grid_item in grid: + metric_key = grid_item['metrics'] + stat_groups_item = grid_item['stat_group_item'] + + param_name, stat_group = stat_groups_item + stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] + title = ('PARAMETER {!r} - {}'.format(param_name, metric_key)) + print('\n\n') + print(title) + print('=' * len(title)) + print(stats_row['moments']) + anova_rank_p = stats_row['anova_rank_p'] + anova_mean_p = stats_row['anova_mean_p'] + # Rougly speaking + print('') + print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < 
p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print('') + print('Pairwise T-Tests') + for pairstat in stats_row['pairwise']: + # Is this backwards? + value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') + if 'ttest_ind' in pairstat: + ttest_ind_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + if 'ttest_rel' in pairstat: + ttest_rel_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + + print(self.stats_table) + + def conclusions(self): + conclusions = [] + for stat in self.statistics: + param_name = stat['param_name'] + metric = stat['metric'] + for pairstat in stat['pairwise']: + value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + pvalue = stat = pairstat['ttest_ind'].pvalue + txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') + conclusions.append(txt) + return conclusions + + +class SkillTracker: + """ + Wrapper around openskill + + Args: + player_ids (List[T]): + a list of ids (usually ints) used to represent each player + + Example: + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self = SkillTracker([1, 2, 3, 4, 5]) + >>> self.observe([2, 3]) # Player 2 beat player 3. + >>> self.observe([1, 2, 5, 3]) # Player 3 didnt play this round. + >>> self.observe([2, 3, 4, 5, 1]) # Everyone played, player 2 won. + >>> win_probs = self.predict_win() + >>> print('win_probs = {}'.format(ub.repr2(win_probs, nl=1, precision=2))) + win_probs = { + 1: 0.20, + 2: 0.21, + 3: 0.19, + 4: 0.20, + 5: 0.20, + } + """ + + def __init__(self, player_ids): + import openskill + self.player_ids = player_ids + self.ratings = {m: openskill.Rating() for m in player_ids} + self.observations = [] + + def predict_win(self): + """ + Estimate the probability that a particular player will win given the + current ratings. + + Returns: + Dict[T, float]: mapping from player ids to win probabilites + """ + from openskill import predict_win + teams = [[p] for p in list(self.ratings.keys())] + ratings = [[r] for r in self.ratings.values()] + probs = predict_win(ratings) + win_probs = {team[0]: prob for team, prob in zip(teams, probs)} + return win_probs + + def observe(self, ranking): + """ + After simulating a round, pass the ranked order of who won + (winner is first, looser is last) to this function. And it + updates the rankings. + + Args: + ranking (List[T]): + ranking of all the players that played in this round + winners are at the front (0-th place) of the list. 
+ """ + import openskill + self.observations.append(ranking) + ratings = self.ratings + team_standings = [[r] for r in ub.take(ratings, ranking)] + new_values = openskill.rate(team_standings) # Not inplace + new_ratings = [openskill.Rating(*new[0]) for new in new_values] + ratings.update(ub.dzip(ranking, new_ratings)) + + +if __name__ == "__main__": + """ + CommandLine: + python ~/code/ultrajson/tests/benchmark3.py + """ + benchmark_json_dumps() From da6428296d2ace46f822ed86a703a88226ad1cea Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 23 May 2022 00:52:30 -0400 Subject: [PATCH 02/25] Working on benchmarks with details statistical analysis --- tests/benchmark3.py | 918 ++------------------------- tests/benchmarker/__init__.py | 35 + tests/benchmarker/_test_ttest.py | 28 + tests/benchmarker/aggregate.py | 68 ++ tests/benchmarker/benchmarker.py | 230 +++++++ tests/benchmarker/process_context.py | 103 +++ tests/benchmarker/result_analysis.py | 722 +++++++++++++++++++++ tests/benchmarker/util_json.py | 233 +++++++ tests/benchmarker/visualize.py | 113 ++++ 9 files changed, 1596 insertions(+), 854 deletions(-) create mode 100644 tests/benchmarker/__init__.py create mode 100644 tests/benchmarker/_test_ttest.py create mode 100644 tests/benchmarker/aggregate.py create mode 100644 tests/benchmarker/benchmarker.py create mode 100644 tests/benchmarker/process_context.py create mode 100644 tests/benchmarker/result_analysis.py create mode 100644 tests/benchmarker/util_json.py create mode 100644 tests/benchmarker/visualize.py diff --git a/tests/benchmark3.py b/tests/benchmark3.py index adcc44e..181b2a4 100644 --- a/tests/benchmark3.py +++ b/tests/benchmark3.py @@ -3,25 +3,10 @@ Roadmap: - [ ] """ - import random import sys - -import timerit import ubelt as ub -import pandas as pd -import ujson -import json - -import kwarray -import warnings -import math -import scipy -import numpy as np -import itertools as it -import scipy.stats # NOQA - def data_lut(input, size): if input == "Array with UTF-8 strings": @@ -41,55 +26,54 @@ def data_lut(input, size): raise KeyError(input) -def get_instance_info(): - """ - Get information about the machine and version of the library we are running - the benchmarks on. 
+def available_json_impls(): + JSON_IMPLS = {} - Requirements: - cpuinfo - """ - import cpuinfo - import datetime - start_time = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() - cpu_brand = cpuinfo.get_cpu_info()['brand_raw'] - instance_info = { - 'cpu_brand': cpu_brand, - 'start_time': start_time, - } - return instance_info + try: + import json + JSON_IMPLS["json"] = json + except ImportError: + pass + + try: + import ujson + JSON_IMPLS["ujson"] = ujson + except ImportError: + pass + + try: + import nujson + JSON_IMPLS["nujson"] = nujson + except ImportError: + pass + + try: + import orjson + JSON_IMPLS["nujson"] = orjson + except ImportError: + pass + + try: + import simplejson + JSON_IMPLS["simplejson"] = simplejson + except ImportError: + pass + + return JSON_IMPLS def benchmark_json_dumps(): + # TODO: remove this hack + sys.path.append(ub.expandpath('~/code/ultrajson/tests')) + from benchmarker import Benchmarker - JSON_IMPLS = { - "ujson": ujson, # Our json - "json": json, # Python's json - } - - if True: - import nujson - - JSON_IMPLS["nujson"] = nujson - import orjson - - JSON_IMPLS["nujson"] = orjson - import simplejson - - JSON_IMPLS["simplejson"] = simplejson + JSON_IMPLS = available_json_impls() version_infos = {k: v.__version__ for k, v in JSON_IMPLS.items()} def method_lut(impl): return JSON_IMPLS[impl].dumps - # Change params here to modify number of trials - ti = timerit.Timerit(1000, bestof=10, verbose=1) - - # if True, record every trail run and show variance in seaborn - # if False, use the standard timerit min/mean measures - RECORD_ALL = 1 - # These are the parameters that we benchmark over basis = { "input": [ @@ -99,26 +83,17 @@ def benchmark_json_dumps(): "size": [1, 32, 256, 1024, 2048], "impl": list(JSON_IMPLS.keys()), } - xlabel = "size" - # Set these to empty lists if they are not used - group_labels = { - "col": ["input"], - "hue": ["impl"], - "size": [], - } - grid_iter = list(ub.named_product(basis)) - instance_info = get_instance_info() + benchmark = Benchmarker( + name='bench_json_dumps', + # Change params here to modify number of trials + num=100, + bestof=10, + basis=basis, + ) # For each variation of your experiment, create a row. - rows = [] - for params in grid_iter: - group_keys = {} - for gname, labels in group_labels.items(): - group_keys[gname + "_key"] = ub.repr2( - ub.dict_isect(params, labels), compact=1, si=1 - ) - key = ub.repr2(params, compact=1, si=1) + for params in benchmark.iter_params(): # Make any modifications you need to compute input kwargs for each # method here. impl = params["impl"] @@ -128,798 +103,33 @@ def benchmark_json_dumps(): data = data_lut(params["input"], params["size"]) # Timerit will run some user-specified number of loops. # and compute time stats with similar methodology to timeit - for timer in ti.reset(key): + for timer in benchmark.measure(): # Put any setup logic you dont want to time here. # ... 
with timer: # Put the logic you want to time here method(data) - if RECORD_ALL: - # Seaborn will show the variance if this is enabled, otherwise - # use the robust timerit mean / min times - # chunk_iter = ub.chunks(ti.times, ti.bestof) - # times = list(map(min, chunk_iter)) # TODO: timerit method for this - times = ti.robust_times() - for time in times: - row = { - "time": time, - "key": key, - "ti_bestof": ti.bestof, - **instance_info, - **group_keys, - **params, - } - rows.append(row) - else: - row = { - "mean": ti.mean(), - "std": ti.std(), - "min": ti.min(), - "key": key, - "ti_num": ti.num, - "ti_bestof": ti.bestof, - **instance_info, - **group_keys, - **params, - } - rows.append(row) - - bench_results_dpath = ub.Path(ujson.__file__).parent / 'benchmark_results' - bench_results_dpath.ensuredir() - timestamp = instance_info['start_time'].replace(':', '') - bench_results_fpath = bench_results_dpath / 'benchmarks_{}.json'.format(timestamp) - - with open(bench_results_fpath, 'w') as file: - json.dump(rows, file) - - benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) - - -def benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL): - - USE_OPENSKILL = True - - time_key = "time" if RECORD_ALL else "min" - - # The rows define a long-form pandas data array. - # Data in long-form makes it very easy to use seaborn. - data = pd.DataFrame(rows) - data = data.sort_values(time_key) - - if RECORD_ALL: - # Show the min / mean if we record all - min_times = data.groupby("key").min().rename({"time": "min"}, axis=1) - mean_times = ( - data.groupby("key")[["time"]].mean().rename({"time": "mean"}, axis=1) - ) - stats_data = pd.concat([min_times, mean_times], axis=1) - stats_data = stats_data.sort_values("min") - else: - stats_data = data - - if USE_OPENSKILL: - # Track the "skill" of each method - # The idea is that each setting of parameters is a game, and each - # "impl" is a player. We rank the players by which is fastest, and - # update their ranking according to the Weng-Lin Bayes ranking model. - # This does not take the fact that some "games" (i.e. parameter - # settings) are more important than others, but it should be fairly - # robust on average. - skillboard = SkillTracker(basis["impl"]) - - other_keys = sorted( - set(stats_data.columns) - - {"key", "impl", "min", "mean", "hue_key", "size_key", "style_key"} - ) - for params, variants in stats_data.groupby(other_keys): - variants = variants.sort_values("mean") - ranking = variants["impl"].reset_index(drop=True) - - mean_speedup = variants["mean"].max() / variants["mean"] - stats_data.loc[mean_speedup.index, "mean_speedup"] = mean_speedup - min_speedup = variants["min"].max() / variants["min"] - stats_data.loc[min_speedup.index, "min_speedup"] = min_speedup - - if USE_OPENSKILL: - skillboard.observe(ranking) - - print("Statistics:") - print(stats_data) - - if USE_OPENSKILL: - win_probs = skillboard.predict_win() - win_probs = ub.sorted_vals(win_probs, reverse=True) - print( - "Aggregated Rankings = {}".format( - ub.repr2(win_probs, nl=1, precision=4, align=":") - ) - ) - - plot = True - if plot: - # import seaborn as sns - # kwplot autosns works well for IPython and script execution. - # not sure about notebooks. 
- import seaborn as sns - - sns.set() - from matplotlib import pyplot as plt - - plotkw = {} - for gname, labels in group_labels.items(): - if labels: - plotkw[gname] = gname + "_key" - - # Your variables may change - # ax = plt.figure().gca() - col = plotkw.pop("col") - facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) - facet.map_dataframe(sns.lineplot, x=xlabel, y=time_key, marker="o", **plotkw) - facet.add_legend() - # sns.lineplot(data=data, ) - # ax.set_title('JSON Benchmarks') - # ax.set_xlabel('Size') - # ax.set_ylabel('Time') - # ax.set_xscale('log') - # ax.set_yscale('log') - - try: - __IPYTHON__ - except NameError: - plt.show() - - -class Result(ub.NiceRepr): - """ - Storage of names, parameters, and quality metrics for a single experiment. - - Attributes: - name (str | None): - Name of the experiment. Optional. This is unused in the analysis. - (i.e. names will never be used computationally. Use them for keys) - - params (Dict[str, object]): configuration of the experiment. - This is a dictionary mapping a parameter name to its value. - - metrics (Dict[str, float]): quantitative results of the experiment - This is a dictionary for each quality metric computed on this - result. - - meta (Dict | None): any other metadata about this result. - This is unused in the analysis. - - Example: - >>> self = Result.demo(rng=32) - >>> print('self = {}'.format(self)) - self = - """ - def __init__(self, name, params, metrics, meta=None): - self.name = name - self.params = params - self.metrics = metrics - self.meta = meta - - def to_dict(self): - row = ub.dict_union({'name': self.name}, self.metrics, self.params) - return row - - def __nice__(self): - row = self.to_dict() - text = ub.repr2(row, compact=True, precision=2, sort=0) - return text - - @classmethod - def demo(cls, rng=None): - import numpy as np - import string - rng = kwarray.ensure_rng(rng) - demo_param_space = { - 'param1': list(range(3)), - 'param2': np.linspace(0, 10, 10), - 'param3': list(string.ascii_lowercase[0:3]), - } - params = {k: rng.choice(b) for k, b in demo_param_space.items()} - metrics = { - 'f1': rng.rand(), - 'acc': rng.rand(), - } - name = ub.hash_data(params)[0:8] - self = cls(name, params, metrics) - return self - - -class ResultAnalysis(ub.NiceRepr): - """ - Groups and runs stats on results - - Runs statistical tests on sets of configuration-metrics pairs - - Attributes: - results (List[Result]): list of results - - ignore_metrics (Set[str]): metrics to ignore - - ignore_params (Set[str]): parameters to ignore - - metric_objectives (Dict[str, str]): - indicate if each metrix should be maximized "max" or minimized - "min" - - metrics (List[str]): - only consider these metrics - - abalation_orders (Set[int]): - The number of parameters to be held constant in each statistical - grouping. Defaults to 1, so it groups together results where 1 - variable is held constant. Including 2 will include pairwise - settings of parameters to be held constant. Using -1 or -2 means - all but 1 or 2 parameters will be held constant, repsectively. 
- - default_objective (str): - assume max or min for unknown metrics - - Example: - >>> self = ResultAnalysis.demo() - >>> self.analysis() - - Example: - >>> # Given a list of experiments, configs, and results - >>> # Create a ResultAnalysis object - >>> results = ResultAnalysis([ - >>> Result('expt0', {'param1': 2, 'param3': 'b'}, {'f1': 0.75}), - >>> Result('expt1', {'param1': 0, 'param3': 'c'}, {'f1': 0.92}), - >>> Result('expt2', {'param1': 1, 'param3': 'b'}, {'f1': 0.77}), - >>> Result('expt3', {'param1': 1, 'param3': 'a'}, {'f1': 0.67}), - >>> Result('expt4', {'param1': 0, 'param3': 'c'}, {'f1': 0.98}), - >>> Result('expt5', {'param1': 2, 'param3': 'a'}, {'f1': 0.86}), - >>> Result('expt6', {'param1': 1, 'param3': 'c'}, {'f1': 0.77}), - >>> Result('expt7', {'param1': 1, 'param3': 'c'}, {'f1': 0.41}), - >>> Result('expt8', {'param1': 1, 'param3': 'a'}, {'f1': 0.64}), - >>> Result('expt9', {'param1': 0, 'param3': 'b'}, {'f1': 0.95}), - >>> ]) - >>> # Calling the analysis method prints something like the following - >>> results.analysis() - - PARAMETER 'param1' - f1 - ======================= - f1 mean std max min num best - param1 - 0 0.950 0.030000 0.98 0.92 3.0 0.98 - 2 0.805 0.077782 0.86 0.75 2.0 0.86 - 1 0.652 0.147377 0.77 0.41 5.0 0.77 - - ANOVA hypothesis (roughly): the param 'param1' has no effect on the metric - Reject this hypothesis if the p value is less than a threshold - Rank-ANOVA: p=0.0397 - Mean-ANOVA: p=0.0277 - - Pairwise T-Tests - Is param1=0 about as good as param1=2? - ttest_ind: p=0.2058 - Is param1=1 about as good as param1=2? - ttest_ind: p=0.1508 - - - PARAMETER 'param3' - f1 - ======================= - f1 mean std max min num best - param3 - c 0.770000 0.255734 0.98 0.41 4.0 0.98 - b 0.823333 0.110151 0.95 0.75 3.0 0.95 - a 0.723333 0.119304 0.86 0.64 3.0 0.86 - - ANOVA hypothesis (roughly): the param 'param3' has no effect on the metric - Reject this hypothesis if the p value is less than a threshold - Rank-ANOVA: p=0.5890 - Mean-ANOVA: p=0.8145 - - Pairwise T-Tests - Is param3=b about as good as param3=c? - ttest_ind: p=0.7266 - Is param3=a about as good as param3=b? - ttest_ind: p=0.3466 - ttest_rel: p=0.3466 - Is param3=a about as good as param3=c? 
- ttest_ind: p=0.7626 - """ - - def __init__(self, results, metrics=None, ignore_params=None, - ignore_metrics=None, metric_objectives=None, - abalation_orders={1}, default_objective='max'): - self.results = results - if ignore_metrics is None: - ignore_metrics = set() - if ignore_params is None: - ignore_params = set() - self.ignore_params = ignore_params - self.ignore_metrics = ignore_metrics - - self.abalation_orders = abalation_orders - self.default_objective = default_objective - - # encode if we want to maximize or minimize a metric - default_metric_to_objective = { - 'ap': 'max', - 'acc': 'max', - 'f1': 'max', - # - 'loss': 'min', - 'brier': 'min', - } - if metric_objectives is None: - metric_objectives = {} - - self.metric_objectives = default_metric_to_objective.copy() - self.metric_objectives.update(metric_objectives) - - self.metrics = metrics - self.statistics = None - - self._description = {} - self._description['built'] = False - self._description['num_results'] = len(self.results) - - def __nice__(self): - # if len(self._description) == 0: - # return 'unbuilt' - # else: - return ub.repr2(self._description, si=1, sv=1) - - @classmethod - def demo(cls, num=10, rng=None): - rng = kwarray.ensure_rng(rng) - results = [Result.demo(rng=rng) for _ in range(num)] - self = cls(results, metrics={'f1', 'acc'}) - return self - - def run(self): - self.build() - self.report() - - def analysis(self): - # alias for run - return self.run() - self.build() - self.report() - - @ub.memoize_property - def table(self): - rows = [r.to_dict() for r in self.results] - table = pd.DataFrame(rows) - return table - - def metric_table(self): - rows = [r.to_dict() for r in self.results] - table = pd.DataFrame(rows) - return table - - @ub.memoize_property - def varied(self): - config_rows = [r.params for r in self.results] - sentinel = object() - # pd.DataFrame(config_rows).channels - varied = dict(ub.varied_values(config_rows, default=sentinel, min_variations=1)) - # remove nans - varied = { - k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} - for k, vs in varied.items()} - varied = {k: vs for k, vs in varied.items() if len(vs)} - return varied - - def abalation_groups(self, param): - """ - Example: - >>> self = ResultAnalysis.demo() - >>> param = 'param2' - >>> self.abalation_groups(param) - """ - table = self.table - config_rows = [r.params for r in self.results] - config_keys = list(map(set, config_rows)) - if self.ignore_params: - config_keys = [c - self.ignore_params for c in config_keys] - isect_params = set.intersection(*config_keys) - other_params = sorted(isect_params - {param}) - groups = [] - for key, group in table.groupby(other_params, dropna=False): - if len(group) > 1: - groups.append(group) - return groups - - def abalate_one(self, param): - """ - Example: - >>> self = ResultAnalysis.demo() - >>> param = 'param2' - >>> # xdoctest: +REQUIRES(module:openskill) - >>> self.abalate_one(param) - """ - import itertools as it - if self.table is None: - self.table = self.build_table() - param_unique_vals = self.table[param].unique().tolist() - score_improvements = ub.ddict(list) - scored_obs = [] - skillboard = SkillTracker(param_unique_vals) - groups = self.abalation_groups(param) - - for group in groups: - for metric_key in self.metrics: - ascending = self._objective_is_ascending(metric_key) - - group = group.sort_values(metric_key, ascending=ascending) - subgroups = group.groupby(param) - if ascending: - best_idx = subgroups[metric_key].idxmax() - else: - best_idx = 
subgroups[metric_key].idxmin() - best_group = group.loc[best_idx] - best_group = best_group.sort_values(metric_key, ascending=ascending) - - for x1, x2 in it.product(best_group.index, best_group.index): - if x1 != x2: - r1 = best_group.loc[x1] - r2 = best_group.loc[x2] - k1 = r1[param] - k2 = r2[param] - diff = r1[metric_key] - r2[metric_key] - score_improvements[(k1, k2)].append(diff) - - # metric_vals = best_group[metric_key].values - # diffs = metric_vals[None, :] - metric_vals[:, None] - best_group.set_index(param) - # best_group[param] - # best_group[metric_key].diff() - scored_ranking = best_group[[param, metric_key]].reset_index(drop=True) - scored_obs.append(scored_ranking) - skillboard.observe(scored_ranking[param]) - - print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) - win_probs = skillboard.predict_win() - print(f'win_probs={win_probs}') - for key, improves in score_improvements.items(): - k1, k2 = key - improves = np.array(improves) - pos_delta = improves[improves > 0] - print(f'\nWhen {param}={k1} is better than {param}={k2}') - print(pd.DataFrame([pd.Series(pos_delta).describe().T])) - return scored_obs - # self.varied[param] - - def _objective_is_ascending(self, metric_key): - """ - Return True if we should minimize the objective (lower is better) - Return False if we should maximize the objective (higher is better) - """ - objective = self.metric_objectives.get(metric_key, None) - if objective is None: - warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') - objective = self.default_objective - ascending = (objective == 'min') - return ascending - - def test_group(self, param_group, metric_key): - """ - Get stats for a particular metric / constant group - - Args: - param_group (List[str]): group of parameters to hold constant. - metric_key (str): The metric to test. - - Returns: - dict - # TODO : document these stats clearly and accurately - - Example: - >>> self = ResultAnalysis.demo(num=30) - >>> print(self.table) - >>> param_group = ['param2'] - >>> metric_key = 'f1' - >>> stats_row = self.test_group(param_group, metric_key) - >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) - >>> # --- - >>> self.build() - >>> self.report() - """ - param_group_name = ','.join(param_group) - stats_row = { - 'param_name': param_group_name, - 'metric': metric_key, - } - # param_values = varied[param_name] - # stats_row['param_values'] = param_values - ascending = self._objective_is_ascending(metric_key) - - # Find all items with this particular param value - value_to_metric_group = {} - value_to_metric_stats = {} - value_to_metric = {} - - varied_cols = sorted(self.varied.keys()) - - # Not sure if this is the right name, these are the other param keys - # that we are not directly investigating, but might have an impact. 
- # We use these to select comparable rows for pairwise t-tests - nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) - - for param_value, group in self.table.groupby(param_group): - metric_group = group[['name', metric_key] + varied_cols] - metric_vals = metric_group[metric_key] - metric_vals = metric_vals.dropna() - if len(metric_vals) > 0: - metric_stats = metric_vals.describe() - value_to_metric_stats[param_value] = metric_stats - value_to_metric_group[param_value] = metric_group - value_to_metric[param_value] = metric_vals.values - - moments = pd.DataFrame(value_to_metric_stats).T - moments = moments.sort_values('mean', ascending=ascending) - moments.index.name = param_group_name - moments.columns.name = metric_key - ranking = moments['mean'].index.to_list() - param_to_rank = ub.invert_dict(dict(enumerate(ranking))) - - # Determine a set of value pairs to do pairwise comparisons on - value_pairs = ub.oset() - value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) - value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) - - # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance - # If the researcher can make the assumptions of an identically - # shaped and scaled distribution for all groups, except for any - # difference in medians, then the null hypothesis is that the - # medians of all groups are equal, and the alternative - # hypothesis is that at least one population median of one - # group is different from the population median of at least one - # other group. - try: - anova_krus_result = scipy.stats.kruskal(*value_to_metric.values()) - except ValueError: - anova_krus_result = scipy.stats.stats.KruskalResult(np.nan, np.nan) - - # https://en.wikipedia.org/wiki/One-way_analysis_of_variance - # The One-Way ANOVA tests the null hypothesis, which states - # that samples in all groups are drawn from populations with - # the same mean values - if len(value_to_metric) > 1: - anova_1way_result = scipy.stats.f_oneway(*value_to_metric.values()) - else: - anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) - - stats_row['anova_rank_H'] = anova_krus_result.statistic - stats_row['anova_rank_p'] = anova_krus_result.pvalue - stats_row['anova_mean_F'] = anova_1way_result.statistic - stats_row['anova_mean_p'] = anova_1way_result.pvalue - stats_row['moments'] = moments - - pairwise_statistics = [] - for pair in value_pairs: - pair_statistics = {} - # try: - # param_val1, param_val2 = sorted(pair) - # except Exception: - # param_val1, param_val2 = (pair) - param_val1, param_val2 = pair - - metric_vals1 = value_to_metric[param_val1] - metric_vals2 = value_to_metric[param_val2] - - rank1 = param_to_rank[param_val1] - rank2 = param_to_rank[param_val2] - pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 - pair_statistics['value1'] = param_val1 - pair_statistics['value2'] = param_val2 - pair_statistics['n1'] = len(metric_vals1) - pair_statistics['n2'] = len(metric_vals2) - ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, equal_var=False) - pair_statistics['ttest_ind'] = ttest_ind_result - - # Do relative checks, need to find comparable subgroups - metric_group1 = value_to_metric_group[param_val1] - metric_group2 = value_to_metric_group[param_val2] - nuisance_vals1 = metric_group1[nuisance_cols] - nuisance_vals2 = metric_group2[nuisance_cols] - nk_to_group1 = dict(list(nuisance_vals1.groupby(nuisance_cols))) - nk_to_group2 = 
dict(list(nuisance_vals2.groupby(nuisance_cols))) - common = set(nk_to_group1) & set(nk_to_group2) - comparable_indexes1 = [] - comparable_indexes2 = [] - if common: - for nk in common: - group1 = nk_to_group1[nk] - group2 = nk_to_group2[nk] - for i, j in it.product(group1.index, group2.index): - comparable_indexes1.append(i) - comparable_indexes2.append(j) - - comparable_groups1 = metric_group1.loc[comparable_indexes1, metric_key] - comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] - - # Does this need to have the values aligned? - ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) - pair_statistics['ttest_rel'] = ttest_rel_result - pairwise_statistics.append(pair_statistics) - - stats_row['pairwise'] = pairwise_statistics - return stats_row - - def build(self): - import itertools as it - if len(self.results) < 2: - raise Exception('need at least 2 results') - - varied = self.varied.copy() - if self.ignore_params: - for k in self.ignore_params: - varied.pop(k, None) - - # Experimental: - # Find Auto-abalation groups - # TODO: when the group size is -1, instead of showing all of the group - # settings, for each group setting do the k=1 analysis within that group - varied_param_names = list(varied.keys()) - num_varied_params = len(varied) - held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} - held_constant_orders = [i for i in held_constant_orders if i > 0] - held_constant_groups = [] - for k in held_constant_orders: - held_constant_groups.extend( - list(map(list, it.combinations(varied_param_names, k)))) - - if self.metrics is None: - avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) - metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) - else: - metrics_of_interest = self.metrics - self.metrics_of_interest = metrics_of_interest - self._description['metrics_of_interest'] = metrics_of_interest - self._description['num_groups'] = len(held_constant_groups) - - # Analyze the impact of each parameter - self.statistics = statistics = [] - for param_group in held_constant_groups: - for metric_key in metrics_of_interest: - stats_row = self.test_group(param_group, metric_key) - statistics.append(stats_row) - - self.stats_table = pd.DataFrame([ - ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) - for d in self.statistics]) - - if len(self.stats_table): - self.stats_table = self.stats_table.sort_values('anova_rank_p') - - self._description['built'] = True - - def report(self): - p_threshold = 0.05 - stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) - stat_groups_items = list(stat_groups.items()) - - # Modify this order to change the grouping pattern - grid = ub.named_product({ - 'stat_group_item': stat_groups_items, - 'metrics': self.metrics_of_interest, + dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() + benchmark.dump_in_dpath(dpath) + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "mean" + + from benchmarker import result_analysis + results = benchmark.result.to_result_list() + analysis = result_analysis.ResultAnalysis( + results, + metrics=[metric_key], + params=['impl'], + metric_objectives={ + 'min': 'min', + 'mean': 'min', + 'time': 'min', }) - for grid_item in grid: - metric_key = grid_item['metrics'] - stat_groups_item = grid_item['stat_group_item'] + analysis.analysis() - param_name, stat_group = stat_groups_item - stats_row = ub.group_items(stat_group, key=lambda x: 
x['metric'])[metric_key][0] - title = ('PARAMETER {!r} - {}'.format(param_name, metric_key)) - print('\n\n') - print(title) - print('=' * len(title)) - print(stats_row['moments']) - anova_rank_p = stats_row['anova_rank_p'] - anova_mean_p = stats_row['anova_mean_p'] - # Rougly speaking - print('') - print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) - print('') - print('Pairwise T-Tests') - for pairstat in stats_row['pairwise']: - # Is this backwards? - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] - if value2 == winner: - value1, value2 = value2, value1 - print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') - if 'ttest_ind' in pairstat: - ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) - if 'ttest_rel' in pairstat: - ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) - - print(self.stats_table) - - def conclusions(self): - conclusions = [] - for stat in self.statistics: - param_name = stat['param_name'] - metric = stat['metric'] - for pairstat in stat['pairwise']: - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] - if value2 == winner: - value1, value2 = value2, value1 - pvalue = stat = pairstat['ttest_ind'].pvalue - txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') - conclusions.append(txt) - return conclusions - - -class SkillTracker: - """ - Wrapper around openskill - - Args: - player_ids (List[T]): - a list of ids (usually ints) used to represent each player - - Example: - >>> # xdoctest: +REQUIRES(module:openskill) - >>> self = SkillTracker([1, 2, 3, 4, 5]) - >>> self.observe([2, 3]) # Player 2 beat player 3. - >>> self.observe([1, 2, 5, 3]) # Player 3 didnt play this round. - >>> self.observe([2, 3, 4, 5, 1]) # Everyone played, player 2 won. - >>> win_probs = self.predict_win() - >>> print('win_probs = {}'.format(ub.repr2(win_probs, nl=1, precision=2))) - win_probs = { - 1: 0.20, - 2: 0.21, - 3: 0.19, - 4: 0.20, - 5: 0.20, - } - """ - - def __init__(self, player_ids): - import openskill - self.player_ids = player_ids - self.ratings = {m: openskill.Rating() for m in player_ids} - self.observations = [] - - def predict_win(self): - """ - Estimate the probability that a particular player will win given the - current ratings. - - Returns: - Dict[T, float]: mapping from player ids to win probabilites - """ - from openskill import predict_win - teams = [[p] for p in list(self.ratings.keys())] - ratings = [[r] for r in self.ratings.values()] - probs = predict_win(ratings) - win_probs = {team[0]: prob for team, prob in zip(teams, probs)} - return win_probs - - def observe(self, ranking): - """ - After simulating a round, pass the ranked order of who won - (winner is first, looser is last) to this function. And it - updates the rankings. - - Args: - ranking (List[T]): - ranking of all the players that played in this round - winners are at the front (0-th place) of the list. 
- """ - import openskill - self.observations.append(ranking) - ratings = self.ratings - team_standings = [[r] for r in ub.take(ratings, ranking)] - new_values = openskill.rate(team_standings) # Not inplace - new_ratings = [openskill.Rating(*new[0]) for new in new_values] - ratings.update(ub.dzip(ranking, new_ratings)) + # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) if __name__ == "__main__": diff --git a/tests/benchmarker/__init__.py b/tests/benchmarker/__init__.py new file mode 100644 index 0000000..1d04095 --- /dev/null +++ b/tests/benchmarker/__init__.py @@ -0,0 +1,35 @@ +""" +A helper module for executing, serializing, combining, and comparing benchmarks +""" + +__mkinit__ = """ +# Autogenerate this file +mkinit ~/code/ultrajson/tests/benchmarker/__init__.py -w +""" + +__version__ = '0.1.0' + +from benchmarker import aggregate +from benchmarker import benchmarker +from benchmarker import process_context +from benchmarker import result_analysis +from benchmarker import util_json +from benchmarker import visualize + +from benchmarker.aggregate import (demo, demo_data,) +from benchmarker.benchmarker import (Benchmarker, BenchmarkerConfig, + BenchmarkerResult, combine_stats, + stats_dict,) +from benchmarker.process_context import (ProcessContext,) +from benchmarker.result_analysis import (Result, ResultAnalysis, SkillTracker,) +from benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'ProcessContext', 'Result', 'ResultAnalysis', 'SkillTracker', + 'aggregate', 'benchmark_analysis', 'benchmarker', 'combine_stats', + 'demo', 'demo_data', 'ensure_json_serializable', + 'find_json_unserializable', 'indexable_allclose', 'process_context', + 'result_analysis', 'stats_dict', 'util_json', 'visualize'] diff --git a/tests/benchmarker/_test_ttest.py b/tests/benchmarker/_test_ttest.py new file mode 100644 index 0000000..4e83a5d --- /dev/null +++ b/tests/benchmarker/_test_ttest.py @@ -0,0 +1,28 @@ + +def check_ttest(): + import scipy + import scipy.stats # NOQA + from benchmarker.benchmarker import stats_dict + import numpy as np + metric_vals1 = np.random.randn(10000) + 0.01 + metric_vals2 = np.random.randn(1000) + + stats1 = stats_dict(metric_vals1) + stats2 = stats_dict(metric_vals2) + + ind_kw = dict( + equal_var=0, + # alternative='two-sided' + alternative='less' if stats1['mean'] < stats2['mean'] else 'greater' + ) + + # Not sure why these are slightly different + res1 = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) + + res2 = scipy.stats.ttest_ind_from_stats( + stats1['mean'], stats1['std'], stats1['n'], + stats2['mean'], stats2['std'], stats2['n'], + **ind_kw + ) + print('res1 = {!r}'.format(res1)) + print('res2 = {!r}'.format(res2)) diff --git a/tests/benchmarker/aggregate.py b/tests/benchmarker/aggregate.py new file mode 100644 index 0000000..41d11e8 --- /dev/null +++ b/tests/benchmarker/aggregate.py @@ -0,0 +1,68 @@ +import json +import pandas as pd +import ubelt as ub + + +def demo_data(): + from benchmarker.benchmarker import Benchmarker + import numpy as np + impl_lut = { + 'numpy': np.sum, + 'builtin': sum, + } + def data_lut(params): + item = 42 if params['dtype'] == 'int' else 42.0 + data = [item] * params['size'] + return data + basis = { + 'impl': ['builtin', 'numpy'], + 'size': [10, 10000], + 'dtype': ['int', 'float'], + } + + dpath = 
ub.Path.appdir('benchmarker/agg_demo').delete().ensuredir() + + def run_one_benchmark(): + self = Benchmarker(name='agg_demo', num=10, bestof=3, basis=basis) + for params in self.iter_params(): + impl = impl_lut[params['impl']] + data = data_lut(params) + for timer in self.measure(): + with timer: + impl(data) + fpath = self.dump_in_dpath(dpath) + return fpath + + # Run the benchmark multiple times + fpaths = [] + for _ in range(5): + fpath = run_one_benchmark() + fpaths.append(fpath) + + return fpaths + + +def demo(): + from benchmarker import BenchmarkerResult + from benchmarker import result_analysis + fpaths = demo_data() + + results = [] + for fpath in fpaths: + data = json.loads(fpath.read_text()) + for row in data['rows']: + result = BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + analysis = result_analysis.ResultAnalysis( + results, + metrics=['min', 'mean'], + params=['impl'], + metric_objectives={ + 'min': 'min', + 'mean': 'min', + }) + analysis.analysis() + # single_df = pd.DataFrame(data['rows']) + # context = data['context'] + # single_df diff --git a/tests/benchmarker/benchmarker.py b/tests/benchmarker/benchmarker.py new file mode 100644 index 0000000..b488fb3 --- /dev/null +++ b/tests/benchmarker/benchmarker.py @@ -0,0 +1,230 @@ +import json +import timerit +import ubelt as ub +import numpy as np +from dataclasses import dataclass +from benchmarker.process_context import ProcessContext + + +@dataclass +class BenchmarkerConfig: + name : str = None + num : int = 100 + bestof : int = 10 + + +class BenchmarkerResult: + """ + Serialization for a single benchmark result + """ + def __init__(self, context, rows): + self.context = context + self.rows = rows + + def __json__(self): + data = { + 'type': 'benchmark_result', + 'context': self.context, + 'rows': self.rows, + } + return data + + @classmethod + def from_json(cls, data): + assert data['type'] == 'benchmark_result' + self = cls(data['context'], data['rows']) + return self + + @classmethod + def load(cls, fpath): + with open(fpath, 'r') as file: + data = json.load(file) + self = cls.from_json(data) + return self + + def to_result_list(self): + """ + Returns a list of result objects suitable for ResultAnalysis + + Returns: + List[Result] + """ + from benchmarker import result_analysis + results = [] + for row in self.rows: + result = result_analysis.Result( + name=row['name'], + metrics=row['metrics'], + params=row['params'].copy(), + ) + machine = self.context['machine'] + assert not ub.dict_isect(result.params, machine) + result.params.update(machine) + results.append(result) + return results + + +class Benchmarker: + """ + Helper to organize the execution and serialization of a benchmark + + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) + >>> from benchmarker.benchmarker import * # NOQA + >>> import numpy as np + >>> impl_lut = { + >>> 'numpy': np.sum, + >>> 'builtin': sum, + >>> } + >>> def data_lut(params): + >>> item = 42 if params['dtype'] == 'int' else 42.0 + >>> data = [item] * params['size'] + >>> return data + >>> basis = { + >>> 'impl': ['builtin', 'numpy'], + >>> 'size': [10, 10000], + >>> 'dtype': ['int', 'float'], + >>> } + >>> self = Benchmarker(name='demo', num=10, bestof=3, basis=basis) + >>> for params in self.iter_params(): + >>> impl = impl_lut[params['impl']] + >>> data = data_lut(params) + >>> for timer in self.measure(): + >>> with timer: + >>> impl(data) + >>> print('self.result = 
{}'.format(ub.repr2(self.result.__json__(), sort=0, nl=2, precision=8))) + >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir() + >>> self.dump_in_dpath(dpath) + """ + def __init__(self, basis={}, **kwargs): + self.basis = basis + + self.config = BenchmarkerConfig(**kwargs) + + self.ti = timerit.Timerit( + num=self.config.num, + bestof=self.config.bestof) + self.context = ProcessContext(name=self.config.name) + self.rows = [] + self.RECORD_ALL = 0 + self.result = None + + def dump_in_dpath(self, dpath): + dpath = ub.Path(dpath) + timestamp = self.context.obj['stop_timestamp'] + fname = f'benchmarks_{self.config.name}_{timestamp}.json' + fpath = dpath / fname + + with open(fpath, 'w') as file: + json.dump(self.result.__json__(), file) + return fpath + + def iter_params(self): + self.context.start() + grid_iter = list(ub.named_product(self.basis)) + for params in grid_iter: + self.params = params + self.key = ub.repr2(params, compact=1, si=1) + yield params + obj = self.context.stop() + self.result = BenchmarkerResult(obj, self.rows) + + def measure(self): + for timer in self.ti.reset(self.key): + yield timer + + rows = self.rows + ti = self.ti + key = self.key + params = self.params + times = ti.robust_times() + if self.RECORD_ALL: + for time in times: + metrics = { + "time": time, + } + row = { + 'name': key, + 'metrics': metrics, + 'params': params, + } + rows.append(row) + else: + times = np.array(ti.robust_times()) + metrics = stats_dict(times) + row = { + 'metrics': metrics, + 'params': params, + 'name': key, + } + rows.append(row) + + +def stats_dict(data): + stats = { + 'n': len(data), + 'mean': data.mean(), + 'std': data.std(), + 'min': data.min(), + 'max': data.max(), + } + return stats + + +def combine_stats(s1, s2): + """ + Helper for combining mean and standard deviation of multiple measurements + + Args: + s1 (dict): stats dict containing mean, std, and n + s2 (dict): stats dict containing mean, std, and n + + Example: + >>> basis = { + >>> 'n1': [1, 10, 100, 10000], + >>> 'n2': [1, 10, 100, 10000], + >>> } + >>> for params in ub.named_product(basis): + >>> data1 = np.random.rand(params['n1']) + >>> data2 = np.random.rand(params['n2']) + >>> data3 = np.hstack([data1, data2]) + >>> s1 = stats_dict(data1) + >>> s2 = stats_dict(data2) + >>> s3 = stats_dict(data3) + >>> # Check that our combo works + >>> combo_s3 = combine_stats(s1, s2) + >>> compare = pd.DataFrame({'raw': s3, 'combo': combo_s3}) + >>> print(compare) + >>> assert np.allclose(compare.raw, compare.combo) + + References: + https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations + https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups + """ + stats = [s1, s2] + sizes = np.array([s['n'] for s in stats]) + means = np.array([s['mean'] for s in stats]) + stds = np.array([s['std'] for s in stats]) + mins = np.array([s['min'] for s in stats]) + maxs = np.array([s['max'] for s in stats]) + varis = stds * stds + + combo_size = sizes.sum() + combo_mean = (sizes * means).sum() / combo_size + + mean_deltas = (means - combo_mean) + + sv = (sizes * varis).sum() + sm = (sizes * (mean_deltas * mean_deltas)).sum() + combo_vars = (sv + sm) / combo_size + combo_std = np.sqrt(combo_vars) + + combo_stats = { + 'n': combo_size, + 'mean': combo_mean, + 'std': combo_std, + 'min': mins.min(), + 'max': maxs.max(), + } + return combo_stats diff --git a/tests/benchmarker/process_context.py b/tests/benchmarker/process_context.py new file mode 100644 index 0000000..e198f9c 
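The pooled statistics returned by combine_stats above follow the standard identity for merging per-group means and population variances (within-group variance plus between-group mean shifts, divided by the combined count). A minimal self-contained numeric check of that identity, with illustrative array names, might look like:

import numpy as np

a = np.random.rand(100)
b = np.random.rand(37)
both = np.hstack([a, b])

n1, n2 = len(a), len(b)
m1, m2 = a.mean(), b.mean()
v1, v2 = a.var(), b.var()  # population variance (ddof=0), matching data.std() above

n = n1 + n2
m = (n1 * m1 + n2 * m2) / n
v = (n1 * v1 + n2 * v2 + n1 * (m1 - m) ** 2 + n2 * (m2 - m) ** 2) / n

assert np.isclose(m, both.mean())
assert np.isclose(np.sqrt(v), both.std())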
--- /dev/null +++ b/tests/benchmarker/process_context.py @@ -0,0 +1,103 @@ +import ubelt as ub +import socket +import platform +import sys + + +class ProcessContext: + """ + Context manager to track the context under which a result was computed + + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) + >>> from benchmarker.process_context import * # NOQA + >>> self = ProcessContext() + >>> obj = self.start().stop() + """ + + def __init__(self, name=None, args=None, config=None): + if args is None: + args = sys.argv + + self.obj = { + 'type': 'process_context', + 'name': name, + 'args': args, + 'config': config, + 'machine': None, + 'start_timestamp': None, + 'stop_timestamp': None, + } + + def _timestamp(self): + import datetime + timestamp = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() + timestamp = timestamp.replace(':', '') + # timestamp = ub.timestamp() + return timestamp + + def _hostinfo(self): + return { + 'host': socket.gethostname(), + 'user': ub.Path(ub.userhome()).name, + # 'cwd': os.getcwd(), + } + + def _osinfo(self): + uname_system, _, uname_release, uname_version, _, uname_processor = platform.uname() + return { + 'os_name': uname_system, + 'os_release': uname_release, + 'os_version': uname_version, + 'arch': uname_processor, + } + + def _pyinfo(self): + return { + 'py_impl': platform.python_implementation(), + 'py_version': sys.version.replace("\n", ""), + } + + def _meminfo(self): + import psutil + # TODO: could collect memory info at start and stop and intermediate + # stages. Here we just want info that is static wrt to the machine. + # For now, just get the total available. + svmem_info = psutil.virtual_memory() + return { + 'mem_total': svmem_info.total, + } + + # def _cpuinfo(self): + # import cpuinfo + # cpu_info = cpuinfo.get_cpu_info() + # return cpu_info + + def _machine(self): + return ub.dict_union( + self._hostinfo(), + self._meminfo(), + self._osinfo(), + self._pyinfo(), + ) + + def start(self): + self.obj.update({ + 'machine': self._machine(), + 'start_timestamp': self._timestamp(), + 'stop_timestamp': None, + }) + return self + + def stop(self): + self.obj.update({ + 'stop_timestamp': self._timestamp(), + }) + return self.obj + + def __enter__(self): + return self.start() + + def __exit__(self, a, b, c): + self.stop() diff --git a/tests/benchmarker/result_analysis.py b/tests/benchmarker/result_analysis.py new file mode 100644 index 0000000..1067b3e --- /dev/null +++ b/tests/benchmarker/result_analysis.py @@ -0,0 +1,722 @@ +import itertools as it +import math +import numpy as np +import pandas as pd +import ubelt as ub +import warnings +import scipy +import scipy.stats # NOQA + + +class Result(ub.NiceRepr): + """ + Storage of names, parameters, and quality metrics for a single experiment. + + Attributes: + name (str | None): + Name of the experiment. Optional. This is unused in the analysis. + (i.e. names will never be used computationally. Use them for keys) + + params (Dict[str, object]): configuration of the experiment. + This is a dictionary mapping a parameter name to its value. + + metrics (Dict[str, float]): quantitative results of the experiment + This is a dictionary for each quality metric computed on this + result. + + meta (Dict | None): any other metadata about this result. + This is unused in the analysis. 
+ + Example: + >>> self = Result.demo(rng=32) + >>> print('self = {}'.format(self)) + self = + """ + def __init__(self, name, params, metrics, meta=None): + self.name = name + self.params = params + self.metrics = metrics + self.meta = meta + + def to_dict(self): + row = ub.dict_union({'name': self.name}, self.metrics, self.params) + return row + + def __nice__(self): + row = self.to_dict() + text = ub.repr2(row, compact=True, precision=2, sort=0) + return text + + @classmethod + def demo(cls, rng=None): + import numpy as np + import string + import kwarray + rng = kwarray.ensure_rng(rng) + demo_param_space = { + 'param1': list(range(3)), + 'param2': np.linspace(0, 10, 10), + 'param3': list(string.ascii_lowercase[0:3]), + } + params = {k: rng.choice(b) for k, b in demo_param_space.items()} + metrics = { + 'f1': rng.rand(), + 'acc': rng.rand(), + } + name = ub.hash_data(params)[0:8] + self = cls(name, params, metrics) + return self + + +class ResultAnalysis(ub.NiceRepr): + """ + Groups and runs stats on results + + Runs statistical tests on sets of configuration-metrics pairs + + Attributes: + results (List[Result]): list of results + + ignore_metrics (Set[str]): metrics to ignore + + ignore_params (Set[str]): parameters to ignore + + metric_objectives (Dict[str, str]): + indicate if each metrix should be maximized "max" or minimized + "min" + + metrics (List[str]): + only consider these metrics + + params (List[str]): + if given, only consider these params + + abalation_orders (Set[int]): + The number of parameters to be held constant in each statistical + grouping. Defaults to 1, so it groups together results where 1 + variable is held constant. Including 2 will include pairwise + settings of parameters to be held constant. Using -1 or -2 means + all but 1 or 2 parameters will be held constant, repsectively. + + default_objective (str): + assume max or min for unknown metrics + + Example: + >>> self = ResultAnalysis.demo() + >>> self.analysis() + + Example: + >>> # Given a list of experiments, configs, and results + >>> # Create a ResultAnalysis object + >>> results = ResultAnalysis([ + >>> Result('expt0', {'param1': 2, 'param3': 'b'}, {'f1': 0.75}), + >>> Result('expt1', {'param1': 0, 'param3': 'c'}, {'f1': 0.92}), + >>> Result('expt2', {'param1': 1, 'param3': 'b'}, {'f1': 0.77}), + >>> Result('expt3', {'param1': 1, 'param3': 'a'}, {'f1': 0.67}), + >>> Result('expt4', {'param1': 0, 'param3': 'c'}, {'f1': 0.98}), + >>> Result('expt5', {'param1': 2, 'param3': 'a'}, {'f1': 0.86}), + >>> Result('expt6', {'param1': 1, 'param3': 'c'}, {'f1': 0.77}), + >>> Result('expt7', {'param1': 1, 'param3': 'c'}, {'f1': 0.41}), + >>> Result('expt8', {'param1': 1, 'param3': 'a'}, {'f1': 0.64}), + >>> Result('expt9', {'param1': 0, 'param3': 'b'}, {'f1': 0.95}), + >>> ]) + >>> # Calling the analysis method prints something like the following + >>> results.analysis() + + PARAMETER 'param1' - f1 + ======================= + f1 mean std max min num best + param1 + 0 0.950 0.030000 0.98 0.92 3.0 0.98 + 2 0.805 0.077782 0.86 0.75 2.0 0.86 + 1 0.652 0.147377 0.77 0.41 5.0 0.77 + + ANOVA hypothesis (roughly): the param 'param1' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.0397 + Mean-ANOVA: p=0.0277 + + Pairwise T-Tests + Is param1=0 about as good as param1=2? + ttest_ind: p=0.2058 + Is param1=1 about as good as param1=2? 
+ ttest_ind: p=0.1508 + + + PARAMETER 'param3' - f1 + ======================= + f1 mean std max min num best + param3 + c 0.770000 0.255734 0.98 0.41 4.0 0.98 + b 0.823333 0.110151 0.95 0.75 3.0 0.95 + a 0.723333 0.119304 0.86 0.64 3.0 0.86 + + ANOVA hypothesis (roughly): the param 'param3' has no effect on the metric + Reject this hypothesis if the p value is less than a threshold + Rank-ANOVA: p=0.5890 + Mean-ANOVA: p=0.8145 + + Pairwise T-Tests + Is param3=b about as good as param3=c? + ttest_ind: p=0.7266 + Is param3=a about as good as param3=b? + ttest_ind: p=0.3466 + ttest_rel: p=0.3466 + Is param3=a about as good as param3=c? + ttest_ind: p=0.7626 + """ + + def __init__(self, results, metrics=None, params=None, ignore_params=None, + ignore_metrics=None, metric_objectives=None, + abalation_orders={1}, default_objective='max'): + self.results = results + if ignore_metrics is None: + ignore_metrics = set() + if ignore_params is None: + ignore_params = set() + self.ignore_params = ignore_params + self.ignore_metrics = ignore_metrics + + self.abalation_orders = abalation_orders + self.default_objective = default_objective + + # encode if we want to maximize or minimize a metric + default_metric_to_objective = { + 'ap': 'max', + 'acc': 'max', + 'f1': 'max', + # + 'loss': 'min', + 'brier': 'min', + } + if metric_objectives is None: + metric_objectives = {} + + self.metric_objectives = default_metric_to_objective.copy() + self.metric_objectives.update(metric_objectives) + + self.params = params + self.metrics = metrics + self.statistics = None + + self._description = {} + self._description['built'] = False + self._description['num_results'] = len(self.results) + + def __nice__(self): + # if len(self._description) == 0: + # return 'unbuilt' + # else: + return ub.repr2(self._description, si=1, sv=1) + + @classmethod + def demo(cls, num=10, rng=None): + import kwarray + rng = kwarray.ensure_rng(rng) + results = [Result.demo(rng=rng) for _ in range(num)] + self = cls(results, metrics={'f1', 'acc'}) + return self + + def run(self): + self.build() + self.report() + + def analysis(self): + # alias for run + return self.run() + self.build() + self.report() + + @ub.memoize_property + def table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + def metric_table(self): + rows = [r.to_dict() for r in self.results] + table = pd.DataFrame(rows) + return table + + @ub.memoize_property + def varied(self): + config_rows = [r.params for r in self.results] + sentinel = object() + # pd.DataFrame(config_rows).channels + varied = dict(ub.varied_values(config_rows, default=sentinel, min_variations=1)) + # remove nans + varied = { + k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} + for k, vs in varied.items()} + varied = {k: vs for k, vs in varied.items() if len(vs)} + return varied + + def abalation_groups(self, param): + """ + Return groups where the specified parameter(s) are varied, but all + other non-ignored parameters are held the same. 
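As a toy illustration of the grouping this method performs (made-up parameter names and scores): rows that agree on every other varied parameter form one group, and only the requested parameter changes inside it.

import pandas as pd

table = pd.DataFrame([
    {'param1': 0, 'param2': 'a', 'f1': 0.90},
    {'param1': 0, 'param2': 'b', 'f1': 0.70},
    {'param1': 1, 'param2': 'a', 'f1': 0.95},
    {'param1': 1, 'param2': 'b', 'f1': 0.80},
])
# Ablating 'param2' means grouping on the remaining varied parameter, 'param1'.
for _, group in table.groupby(['param1'], dropna=False):
    print(group)  # two groups of two rows; only 'param2' differs within each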
+ + Example: + >>> self = ResultAnalysis.demo() + >>> param = 'param2' + >>> self.abalation_groups(param) + """ + if not ub.iterable(param): + param = [param] + table = self.table + config_rows = [r.params for r in self.results] + config_keys = list(map(set, config_rows)) + # if self.params: + # config_keys = list(self.params) + if self.ignore_params: + config_keys = [c - self.ignore_params for c in config_keys] + isect_params = set.intersection(*config_keys) + other_params = sorted(isect_params - set(param)) + groups = [] + for key, group in table.groupby(other_params, dropna=False): + if len(group) > 1: + groups.append(group) + return groups + + def abalate(self, param): + """ + Example: + >>> self = ResultAnalysis.demo(100) + >>> param = 'param2' + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self.abalate(param) + + >>> self = ResultAnalysis.demo() + >>> param = ['param2', 'param3'] + >>> self.abalate(param) + """ + import itertools as it + if self.table is None: + self.table = self.build_table() + if not ub.iterable(param): + param = [param] + + # For hashable generic dictionary + from collections import namedtuple + gd = namedtuple('config', param) + + # from types import SimpleNamespace + param_unique_vals_ = self.table[param].drop_duplicates().to_dict('records') + param_unique_vals = [gd(**d) for d in param_unique_vals_] + # param_unique_vals = {p: self.table[p].unique().tolist() for p in param} + score_improvements = ub.ddict(list) + scored_obs = [] + skillboard = SkillTracker(param_unique_vals) + groups = self.abalation_groups(param) + + for group in groups: + for metric_key in self.metrics: + ascending = self._objective_is_ascending(metric_key) + + group = group.sort_values(metric_key, ascending=ascending) + subgroups = group.groupby(param) + if ascending: + best_idx = subgroups[metric_key].idxmax() + else: + best_idx = subgroups[metric_key].idxmin() + best_group = group.loc[best_idx] + best_group = best_group.sort_values(metric_key, ascending=ascending) + + for x1, x2 in it.product(best_group.index, best_group.index): + if x1 != x2: + r1 = best_group.loc[x1] + r2 = best_group.loc[x2] + k1 = gd(**r1[param]) + k2 = gd(**r2[param]) + diff = r1[metric_key] - r2[metric_key] + score_improvements[(k1, k2, metric_key)].append(diff) + + # metric_vals = best_group[metric_key].values + # diffs = metric_vals[None, :] - metric_vals[:, None] + best_group.set_index(param) + # best_group[param] + # best_group[metric_key].diff() + scored_ranking = best_group[param + [metric_key]].reset_index(drop=True) + scored_obs.append(scored_ranking) + ranking = [gd(**d) for d in scored_ranking[param].to_dict('records')] + skillboard.observe(ranking) + + print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) + win_probs = skillboard.predict_win() + print('win_probs = {}'.format(ub.repr2(win_probs, nl=1))) + for key, improves in score_improvements.items(): + k1, k2, metric_key = key + improves = np.array(improves) + pos_delta = improves[improves > 0] + print(f'\nWhen {k1} is better than {k2}, the improvement in {metric_key} is') + print(pd.DataFrame([pd.Series(pos_delta).describe().T])) + return scored_obs + + def _objective_is_ascending(self, metric_key): + """ + Return True if we should minimize the objective (lower is better) + Return False if we should maximize the objective (higher is better) + """ + objective = self.metric_objectives.get(metric_key, None) + if objective is None: + warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + objective 
= self.default_objective + ascending = (objective == 'min') + return ascending + + def test_group(self, param_group, metric_key): + """ + Get stats for a particular metric / constant group + + Args: + param_group (List[str]): group of parameters to hold constant. + metric_key (str): The metric to test. + + Returns: + dict + # TODO : document these stats clearly and accurately + + Example: + >>> self = ResultAnalysis.demo(num=30) + >>> print(self.table) + >>> param_group = ['param2'] + >>> metric_key = 'f1' + >>> stats_row = self.test_group(param_group, metric_key) + >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) + >>> # --- + >>> self.build() + >>> self.report() + """ + param_group_name = ','.join(param_group) + stats_row = { + 'param_name': param_group_name, + 'metric': metric_key, + } + # param_values = varied[param_name] + # stats_row['param_values'] = param_values + ascending = self._objective_is_ascending(metric_key) + + # Find all items with this particular param value + value_to_metric_group = {} + value_to_metric_stats = {} + value_to_metric = {} + + varied_cols = sorted(self.varied.keys()) + + # Not sure if this is the right name, these are the other param keys + # that we are not directly investigating, but might have an impact. + # We use these to select comparable rows for pairwise t-tests + nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) + + for param_value, group in self.table.groupby(param_group): + metric_group = group[['name', metric_key] + varied_cols] + metric_vals = metric_group[metric_key] + metric_vals = metric_vals.dropna() + if len(metric_vals) > 0: + metric_stats = metric_vals.describe() + value_to_metric_stats[param_value] = metric_stats + value_to_metric_group[param_value] = metric_group + value_to_metric[param_value] = metric_vals.values + + moments = pd.DataFrame(value_to_metric_stats).T + moments = moments.sort_values('mean', ascending=ascending) + moments.index.name = param_group_name + moments.columns.name = metric_key + ranking = moments['mean'].index.to_list() + param_to_rank = ub.invert_dict(dict(enumerate(ranking))) + + # Determine a set of value pairs to do pairwise comparisons on + value_pairs = ub.oset() + value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) + value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) + + # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance + # If the researcher can make the assumptions of an identically + # shaped and scaled distribution for all groups, except for any + # difference in medians, then the null hypothesis is that the + # medians of all groups are equal, and the alternative + # hypothesis is that at least one population median of one + # group is different from the population median of at least one + # other group. 
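For orientation, this is roughly how the two tests invoked just below behave on made-up groups of metric values (one group per parameter value); a small p-value is evidence that the parameter has an effect.

import scipy.stats

g_a = [0.91, 0.93, 0.95]  # metric values observed with param=a
g_b = [0.71, 0.74, 0.70]  # metric values observed with param=b
g_c = [0.72, 0.75, 0.73]  # metric values observed with param=c

krus = scipy.stats.kruskal(g_a, g_b, g_c)    # rank-based, no normality assumption
anova = scipy.stats.f_oneway(g_a, g_b, g_c)  # compares group means directly
print(krus.pvalue, anova.pvalue)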
+ try: + anova_krus_result = scipy.stats.kruskal(*value_to_metric.values()) + except ValueError: + anova_krus_result = scipy.stats.stats.KruskalResult(np.nan, np.nan) + + # https://en.wikipedia.org/wiki/One-way_analysis_of_variance + # The One-Way ANOVA tests the null hypothesis, which states + # that samples in all groups are drawn from populations with + # the same mean values + if len(value_to_metric) > 1: + anova_1way_result = scipy.stats.f_oneway(*value_to_metric.values()) + else: + anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) + + stats_row['anova_rank_H'] = anova_krus_result.statistic + stats_row['anova_rank_p'] = anova_krus_result.pvalue + stats_row['anova_mean_F'] = anova_1way_result.statistic + stats_row['anova_mean_p'] = anova_1way_result.pvalue + stats_row['moments'] = moments + + pairwise_statistics = [] + for pair in value_pairs: + pair_statistics = {} + # try: + # param_val1, param_val2 = sorted(pair) + # except Exception: + # param_val1, param_val2 = (pair) + param_val1, param_val2 = pair + + metric_vals1 = value_to_metric[param_val1] + metric_vals2 = value_to_metric[param_val2] + + rank1 = param_to_rank[param_val1] + rank2 = param_to_rank[param_val2] + pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 + pair_statistics['value1'] = param_val1 + pair_statistics['value2'] = param_val2 + pair_statistics['n1'] = len(metric_vals1) + pair_statistics['n2'] = len(metric_vals2) + # TODO: probably want to use an alternative=less or greater here + # instead of simply unequal + alternative = 'two-sided' + if 1: + if ascending: + # We want to minimize the metric + alternative = 'less' if rank1 < rank2 else 'greater' + else: + # We want to maximize the metric + alternative = 'greater' if rank1 < rank2 else 'less' + + ind_kw = dict( + equal_var=False, + alternative=alternative, + ) + ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) + + if 0: + from benchmarker.benchmarker import stats_dict + stats1 = stats_dict(metric_vals1) + stats2 = stats_dict(metric_vals2) + scipy.stats.ttest_ind_from_stats( + stats1['mean'], stats1['std'], stats1['n'], + stats2['mean'], stats2['std'], stats2['n'], + **ind_kw + ) + # metric_vals1, metric_vals2, equal_var=False) + + scipy.stats.ttest_ind_from_stats + + pair_statistics['ttest_ind'] = ttest_ind_result + + # Do relative checks, need to find comparable subgroups + metric_group1 = value_to_metric_group[param_val1] + metric_group2 = value_to_metric_group[param_val2] + nuisance_vals1 = metric_group1[nuisance_cols] + nuisance_vals2 = metric_group2[nuisance_cols] + nk_to_group1 = dict(list(nuisance_vals1.groupby(nuisance_cols))) + nk_to_group2 = dict(list(nuisance_vals2.groupby(nuisance_cols))) + common = set(nk_to_group1) & set(nk_to_group2) + comparable_indexes1 = [] + comparable_indexes2 = [] + if common: + for nk in common: + group1 = nk_to_group1[nk] + group2 = nk_to_group2[nk] + for i, j in it.product(group1.index, group2.index): + comparable_indexes1.append(i) + comparable_indexes2.append(j) + + comparable_groups1 = metric_group1.loc[comparable_indexes1, metric_key] + comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] + + # Does this need to have the values aligned? 
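On the alignment question above: scipy.stats.ttest_rel pairs observations by position, so the two sequences must have equal length and position i of one must correspond to position i of the other. The it.product loop above appends the two index lists in lockstep, so each position already refers to the same nuisance-key match. A toy paired example with made-up numbers:

import scipy.stats

group1 = [10.1, 12.3, 9.8, 11.0]
group2 = [9.9, 12.0, 9.7, 10.5]  # group2[i] is the paired partner of group1[i]
res = scipy.stats.ttest_rel(group1, group2)
print(res.pvalue)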
+ ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) + pair_statistics['n_common'] = len(common) + pair_statistics['ttest_rel'] = ttest_rel_result + pairwise_statistics.append(pair_statistics) + + stats_row['pairwise'] = pairwise_statistics + return stats_row + + def build(self): + import itertools as it + if len(self.results) < 2: + raise Exception('need at least 2 results') + + varied = self.varied.copy() + if self.ignore_params: + for k in self.ignore_params: + varied.pop(k, None) + if self.params: + varied = ub.dict_isect(varied, self.params) + + # Experimental: + # Find Auto-abalation groups + # TODO: when the group size is -1, instead of showing all of the group + # settings, for each group setting do the k=1 analysis within that group + varied_param_names = list(varied.keys()) + num_varied_params = len(varied) + held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} + held_constant_orders = [i for i in held_constant_orders if i > 0] + held_constant_groups = [] + for k in held_constant_orders: + held_constant_groups.extend( + list(map(list, it.combinations(varied_param_names, k)))) + + if self.metrics is None: + avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) + metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) + else: + metrics_of_interest = self.metrics + self.metrics_of_interest = metrics_of_interest + self._description['metrics_of_interest'] = metrics_of_interest + self._description['num_groups'] = len(held_constant_groups) + + # Analyze the impact of each parameter + self.statistics = statistics = [] + for param_group in held_constant_groups: + for metric_key in metrics_of_interest: + stats_row = self.test_group(param_group, metric_key) + statistics.append(stats_row) + + self.stats_table = pd.DataFrame([ + ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) + for d in self.statistics]) + + if len(self.stats_table): + self.stats_table = self.stats_table.sort_values('anova_rank_p') + + self._description['built'] = True + + def report(self): + p_threshold = 0.05 + stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) + stat_groups_items = list(stat_groups.items()) + + # Modify this order to change the grouping pattern + grid = ub.named_product({ + 'stat_group_item': stat_groups_items, + 'metrics': self.metrics_of_interest, + }) + for grid_item in grid: + metric_key = grid_item['metrics'] + stat_groups_item = grid_item['stat_group_item'] + + param_name, stat_group = stat_groups_item + stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] + title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) + print('\n\n') + print(title) + print('=' * len(title)) + print(stats_row['moments']) + anova_rank_p = stats_row['anova_rank_p'] + anova_mean_p = stats_row['anova_mean_p'] + # Rougly speaking + print('') + print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print('') + print('Pairwise T-Tests') + for pairstat in stats_row['pairwise']: + # Is this backwards? 
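On the "Is this backwards?" question above: the swap that follows only normalizes the print order so that the ranked winner is the value reported as possibly outperforming the other. A minimal check of that invariant:

value1, value2, winner = 'b', 'a', 'a'
if value2 == winner:
    value1, value2 = value2, value1
assert value1 == winner  # the winner is always printed first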
+ value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') + if 'ttest_ind' in pairstat: + ttest_ind_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + if 'ttest_rel' in pairstat: + n_common = pairstat['n_common'] + ttest_rel_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + + print(self.stats_table) + + def conclusions(self): + conclusions = [] + for stat in self.statistics: + param_name = stat['param_name'] + metric = stat['metric'] + for pairstat in stat['pairwise']: + value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + pvalue = stat = pairstat['ttest_ind'].pvalue + txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') + conclusions.append(txt) + return conclusions + + +class SkillTracker: + """ + Wrapper around openskill + + Args: + player_ids (List[T]): + a list of ids (usually ints) used to represent each player + + Example: + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self = SkillTracker([1, 2, 3, 4, 5]) + >>> self.observe([2, 3]) # Player 2 beat player 3. + >>> self.observe([1, 2, 5, 3]) # Player 3 didnt play this round. + >>> self.observe([2, 3, 4, 5, 1]) # Everyone played, player 2 won. + >>> win_probs = self.predict_win() + >>> print('win_probs = {}'.format(ub.repr2(win_probs, nl=1, precision=2))) + win_probs = { + 1: 0.20, + 2: 0.21, + 3: 0.19, + 4: 0.20, + 5: 0.20, + } + """ + + def __init__(self, player_ids): + import openskill + self.player_ids = player_ids + self.ratings = {m: openskill.Rating() for m in player_ids} + # self.observations = [] + + def predict_win(self): + """ + Estimate the probability that a particular player will win given the + current ratings. + + Returns: + Dict[T, float]: mapping from player ids to win probabilites + """ + from openskill import predict_win + teams = [[p] for p in list(self.ratings.keys())] + ratings = [[r] for r in self.ratings.values()] + probs = predict_win(ratings) + win_probs = {team[0]: prob for team, prob in zip(teams, probs)} + return win_probs + + def observe(self, ranking): + """ + After simulating a round, pass the ranked order of who won + (winner is first, looser is last) to this function. And it + updates the rankings. + + Args: + ranking (List[T]): + ranking of all the players that played in this round + winners are at the front (0-th place) of the list. 
+ """ + import openskill + # self.observations.append(ranking) + ratings = self.ratings + team_standings = [[r] for r in ub.take(ratings, ranking)] + # new_values = openskill.rate(team_standings) # Not inplace + # new_ratings = [openskill.Rating(*new[0]) for new in new_values] + new_team_ratings = openskill.rate(team_standings) + new_ratings = [new[0] for new in new_team_ratings] + ratings.update(ub.dzip(ranking, new_ratings)) diff --git a/tests/benchmarker/util_json.py b/tests/benchmarker/util_json.py new file mode 100644 index 0000000..dc3da85 --- /dev/null +++ b/tests/benchmarker/util_json.py @@ -0,0 +1,233 @@ +import copy +import numpy as np +import ubelt as ub +import json +from collections import OrderedDict +import pathlib + + +def ensure_json_serializable(dict_, normalize_containers=False, verbose=0): + """ + Attempt to convert common types (e.g. numpy) into something json complient + + Convert numpy and tuples into lists + + Args: + normalize_containers (bool, default=False): + if True, normalizes dict containers to be standard python + structures. + + Example: + >>> data = ub.ddict(lambda: int) + >>> data['foo'] = ub.ddict(lambda: int) + >>> data['bar'] = np.array([1, 2, 3]) + >>> data['foo']['a'] = 1 + >>> data['foo']['b'] = (1, np.array([1, 2, 3]), {3: np.int32(3), 4: np.float16(1.0)}) + >>> dict_ = data + >>> print(ub.repr2(data, nl=-1)) + >>> assert list(find_json_unserializable(data)) + >>> result = ensure_json_serializable(data, normalize_containers=True) + >>> print(ub.repr2(result, nl=-1)) + >>> assert not list(find_json_unserializable(result)) + >>> assert type(result) is dict + """ + dict_ = copy.deepcopy(dict_) + + def _norm_container(c): + if isinstance(c, dict): + # Cast to a normal dictionary + if isinstance(c, OrderedDict): + if type(c) is not OrderedDict: + c = OrderedDict(c) + else: + if type(c) is not dict: + c = dict(c) + return c + + walker = ub.IndexableWalker(dict_) + for prefix, value in walker: + if isinstance(value, tuple): + new_value = list(value) + walker[prefix] = new_value + elif isinstance(value, np.ndarray): + new_value = value.tolist() + walker[prefix] = new_value + elif isinstance(value, (np.integer)): + new_value = int(value) + walker[prefix] = new_value + elif isinstance(value, (np.floating)): + new_value = float(value) + walker[prefix] = new_value + elif isinstance(value, (np.complexfloating)): + new_value = complex(value) + walker[prefix] = new_value + elif isinstance(value, pathlib.Path): + new_value = str(value) + walker[prefix] = new_value + elif hasattr(value, '__json__'): + new_value = value.__json__() + walker[prefix] = new_value + elif normalize_containers: + if isinstance(value, dict): + new_value = _norm_container(value) + walker[prefix] = new_value + + if normalize_containers: + # normalize the outer layer + dict_ = _norm_container(dict_) + return dict_ + + +def find_json_unserializable(data, quickcheck=False): + """ + Recurse through json datastructure and find any component that + causes a serialization error. Record the location of these errors + in the datastructure as we recurse through the call tree. + + Args: + data (object): data that should be json serializable + quickcheck (bool): if True, check the entire datastructure assuming + its ok before doing the python-based recursive logic. 
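A hedged usage sketch for the two helpers in this module, assuming the benchmarker package is importable as in the doctests above: first locate values that json.dumps would reject, then coerce them to plain Python types.

import json
import numpy as np
from benchmarker.util_json import ensure_json_serializable, find_json_unserializable

row = {'mean': np.float32(0.25), 'times': np.array([1, 2, 3])}
print(list(find_json_unserializable(row)))  # reports the 'loc' of each offending value
fixed = ensure_json_serializable(row)       # float32 -> float, ndarray -> list
print(json.dumps(fixed))                    # now serializes cleanly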
+ + Returns: + List[Dict]: list of "bad part" dictionaries containing items + 'value' - the value that caused the serialization error + 'loc' - which contains a list of key/indexes that can be used + to lookup the location of the unserializable value. + If the "loc" is a list, then it indicates a rare case where + a key in a dictionary is causing the serialization error. + + Example: + >>> part = ub.ddict(lambda: int) + >>> part['foo'] = ub.ddict(lambda: int) + >>> part['bar'] = np.array([1, 2, 3]) + >>> part['foo']['a'] = 1 + >>> # Create a dictionary with two unserializable parts + >>> data = [1, 2, {'nest1': [2, part]}, {frozenset({'badkey'}): 3, 2: 4}] + >>> parts = list(find_json_unserializable(data)) + >>> print('parts = {}'.format(ub.repr2(parts, nl=1))) + >>> # Check expected structure of bad parts + >>> assert len(parts) == 2 + >>> part = parts[1] + >>> assert list(part['loc']) == [2, 'nest1', 1, 'bar'] + >>> # We can use the "loc" to find the bad value + >>> for part in parts: + >>> # "loc" is a list of directions containing which keys/indexes + >>> # to traverse at each descent into the data structure. + >>> directions = part['loc'] + >>> curr = data + >>> special_flag = False + >>> for key in directions: + >>> if isinstance(key, list): + >>> # special case for bad keys + >>> special_flag = True + >>> break + >>> else: + >>> # normal case for bad values + >>> curr = curr[key] + >>> if special_flag: + >>> assert part['data'] in curr.keys() + >>> assert part['data'] is key[1] + >>> else: + >>> assert part['data'] is curr + """ + needs_check = True + if quickcheck: + try: + # Might be a more efficient way to do this check. We duplicate a lot of + # work by doing the check for unserializable data this way. + json.dumps(data) + except Exception: + # If there is unserializable data, find out where it is. + # is_serializable = False + pass + else: + # is_serializable = True + needs_check = False + + if needs_check: + # mode = 'new' + # if mode == 'new': + scalar_types = (int, float, str, type(None)) + container_types = (tuple, list, dict) + serializable_types = scalar_types + container_types + walker = ub.IndexableWalker(data) + for prefix, value in walker: + *root, key = prefix + if not isinstance(key, scalar_types): + # Special case where a dict key is the error value + # Purposely make loc non-hashable so its not confused with + # an address. All we can know in this case is that they key + # is at this level, there is no concept of where. + yield {'loc': root + [['.keys', key]], 'data': key} + elif not isinstance(value, serializable_types): + yield {'loc': prefix, 'data': value} + + +def indexable_allclose(dct1, dct2, return_info=False): + """ + Walks through two nested data structures and ensures that everything is + roughly the same. 
+ + Args: + dct1: a nested indexable item + dct2: a nested indexable item + + Example: + >>> dct1 = { + >>> 'foo': [1.222222, 1.333], + >>> 'bar': 1, + >>> 'baz': [], + >>> } + >>> dct2 = { + >>> 'foo': [1.22222, 1.333], + >>> 'bar': 1, + >>> 'baz': [], + >>> } + >>> assert indexable_allclose(dct1, dct2) + """ + walker1 = ub.IndexableWalker(dct1) + walker2 = ub.IndexableWalker(dct2) + flat_items1 = [ + (path, value) for path, value in walker1 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + flat_items2 = [ + (path, value) for path, value in walker2 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + + flat_items1 = sorted(flat_items1) + flat_items2 = sorted(flat_items2) + + if len(flat_items1) != len(flat_items2): + info = { + 'faillist': ['length mismatch'] + } + final_flag = False + else: + passlist = [] + faillist = [] + + for t1, t2 in zip(flat_items1, flat_items2): + p1, v1 = t1 + p2, v2 = t2 + assert p1 == p2 + + flag = (v1 == v2) + if not flag: + if isinstance(v1, float) and isinstance(v2, float) and np.isclose(v1, v2): + flag = True + if flag: + passlist.append(p1) + else: + faillist.append((p1, v1, v2)) + + final_flag = len(faillist) == 0 + info = { + 'passlist': passlist, + 'faillist': faillist, + } + + if return_info: + return final_flag, info + else: + return final_flag diff --git a/tests/benchmarker/visualize.py b/tests/benchmarker/visualize.py new file mode 100644 index 0000000..41f4679 --- /dev/null +++ b/tests/benchmarker/visualize.py @@ -0,0 +1,113 @@ +import pandas as pd +import ubelt as ub + + +def benchmark_analysis(rows, xlabel, group_labels, basis, ): + # xlabel = "size" + # Set these to empty lists if they are not used + # group_labels = { + # "col": ["input"], + # "hue": ["impl"], + # "size": [], + # } + # group_keys = {} + # for gname, labels in group_labels.items(): + # group_keys[gname + "_key"] = ub.repr2( + # ub.dict_isect(params, labels), compact=1, si=1 + # ) + # key = ub.repr2(params, compact=1, si=1) + + from process_tracker.result_analysis import SkillTracker + RECORD_ALL = 0 + + USE_OPENSKILL = True + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "min" + + # The rows define a long-form pandas data array. + # Data in long-form makes it very easy to use seaborn. + data = pd.DataFrame(rows) + data = data.sort_values(metric_key) + + if RECORD_ALL: + # Show the min / mean if we record all + min_times = data.groupby("key").min().rename({"time": "min"}, axis=1) + mean_times = ( + data.groupby("key")[["time"]].mean().rename({"time": "mean"}, axis=1) + ) + stats_data = pd.concat([min_times, mean_times], axis=1) + stats_data = stats_data.sort_values("min") + else: + stats_data = data + + if USE_OPENSKILL: + # Track the "skill" of each method + # The idea is that each setting of parameters is a game, and each + # "impl" is a player. We rank the players by which is fastest, and + # update their ranking according to the Weng-Lin Bayes ranking model. + # This does not take the fact that some "games" (i.e. parameter + # settings) are more important than others, but it should be fairly + # robust on average. 
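A hedged sketch of the aggregation idea described in the comment above, assuming the benchmarker package layout from the __init__ and that openskill is installed; the impl names are illustrative. Each parameter setting counts as one game, with the fastest implementation listed first.

from benchmarker.result_analysis import SkillTracker

skillboard = SkillTracker(['ujson', 'json', 'simplejson'])
skillboard.observe(['ujson', 'simplejson', 'json'])  # ujson fastest on this setting
skillboard.observe(['ujson', 'json', 'simplejson'])
print(skillboard.predict_win())  # e.g. ujson ends up with the largest win probability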
+ skillboard = SkillTracker(basis["impl"]) + + other_keys = sorted( + set(stats_data.columns) + - {"key", "impl", "min", "mean", "hue_key", "size_key", "style_key"} + ) + for params, variants in stats_data.groupby(other_keys): + variants = variants.sort_values("mean") + ranking = variants["impl"].reset_index(drop=True) + + mean_speedup = variants["mean"].max() / variants["mean"] + stats_data.loc[mean_speedup.index, "mean_speedup"] = mean_speedup + min_speedup = variants["min"].max() / variants["min"] + stats_data.loc[min_speedup.index, "min_speedup"] = min_speedup + + if USE_OPENSKILL: + skillboard.observe(ranking) + + print("Statistics:") + print(stats_data) + + if USE_OPENSKILL: + win_probs = skillboard.predict_win() + win_probs = ub.sorted_vals(win_probs, reverse=True) + print( + "Aggregated Rankings = {}".format( + ub.repr2(win_probs, nl=1, precision=4, align=":") + ) + ) + + plot = True + if plot: + # import seaborn as sns + # kwplot autosns works well for IPython and script execution. + # not sure about notebooks. + import seaborn as sns + + sns.set() + from matplotlib import pyplot as plt + + plotkw = {} + for gname, labels in group_labels.items(): + if labels: + plotkw[gname] = gname + "_key" + + # Your variables may change + # ax = plt.figure().gca() + col = plotkw.pop("col") + facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) + facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) + facet.add_legend() + # sns.lineplot(data=data, ) + # ax.set_title('JSON Benchmarks') + # ax.set_xlabel('Size') + # ax.set_ylabel('Time') + # ax.set_xscale('log') + # ax.set_yscale('log') + + try: + __IPYTHON__ + except NameError: + plt.show() From d036df252ffcc77beec3cbdb0e0f5290eba7195d Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 07:36:27 -0400 Subject: [PATCH 03/25] Port datasets --- tests/benchmark3.py | 217 ++++++++++++++------ tests/benchmarker/_test_ttest.py | 28 --- tests/benchmarker/benchmarker.py | 38 ++-- tests/benchmarker/result_analysis.py | 293 ++++++++++++++++++--------- 4 files changed, 368 insertions(+), 208 deletions(-) delete mode 100644 tests/benchmarker/_test_ttest.py diff --git a/tests/benchmark3.py b/tests/benchmark3.py index 181b2a4..b6e084e 100644 --- a/tests/benchmark3.py +++ b/tests/benchmark3.py @@ -8,58 +8,136 @@ import sys import ubelt as ub -def data_lut(input, size): - if input == "Array with UTF-8 strings": - test_object = [] - for x in range(size): - test_object.append( - "نظام الحكم سلطاني وراثي " - "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" - " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " - ) +def json_test_data_generators(): + """ + Generates data for benchmarks with various sizes + + Returns: + Dict[str, callable]: + a mapping from test data name to its generator + + Example: + >>> data_lut = json_test_data_generators() + >>> size = 2 + >>> keys = sorted(set(data_lut) - {'Complex object'}) + >>> for key in keys: + >>> func = data_lut[key] + >>> test_object = func(size) + >>> print('key = {!r}'.format(key)) + >>> print('test_object = {!r}'.format(test_object)) + """ + data_lut = {} + def _register_data(name): + def _wrap(func): + data_lut[name] = func + return _wrap + + # seed if desired + #rng = random.Random() + rng = random + + @_register_data('Array with doubles') + def array_with_doubles(size): + test_object = [sys.maxsize * rng.random() for _ in range(size)] return test_object - elif input == "Array with doubles": - test_object = [] - for x in 
range(256): - test_object.append(sys.maxsize * random.random()) - else: - raise KeyError(input) + + @_register_data('Array with UTF-8 strings') + def array_with_utf8_strings(size): + utf8_string = ( + "نظام الحكم سلطاني وراثي " + "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" + " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " + ) + test_object = [utf8_string for _ in range(size)] + return test_object + + @_register_data('Medium complex object') + def medium_complex_object(size): + user = { + "userId": 3381293, + "age": 213, + "username": "johndoe", + "fullname": "John Doe the Second", + "isAuthorized": True, + "liked": 31231.31231202, + "approval": 31.1471, + "jobs": [1, 2], + "currJob": None, + } + friends = [user, user, user, user, user, user, user, user] + test_object = [[user, friends] for _ in range(size)] + return test_object + + @_register_data('Array with True values') + def true_values(size): + test_object = [True for _ in range(size)] + return test_object + + @_register_data('Array of Dict[str, int]') + def array_of_dict_string_int(size): + test_object = [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(size) + ] + return test_object + + @_register_data('Dict of List[Dict[str, int]]') + def dict_of_list_dict_str_int(size): + keys = set() + while len(keys) < size: + key = str(rng.random() * 20) + keys.add(key) + test_object = { + key: [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(256) + ] + for key in keys + } + return test_object + + @_register_data('Complex object') + def complex_object(size): + import json + # TODO: might be better to reigster this file with setup.py or + # download it via some mechanism + try: + dpath = ub.Path(__file__).parent + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + except Exception: + import ujson + dpath = ub.Path(ujson.__file__).parent / 'tests' + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + with open(fpath, 'r') as f: + test_object = json.load(f) + if size > 1: + test_object = [test_object] * size + return test_object + + return data_lut def available_json_impls(): - JSON_IMPLS = {} - - try: - import json - JSON_IMPLS["json"] = json - except ImportError: - pass - - try: - import ujson - JSON_IMPLS["ujson"] = ujson - except ImportError: - pass - - try: - import nujson - JSON_IMPLS["nujson"] = nujson - except ImportError: - pass - - try: - import orjson - JSON_IMPLS["nujson"] = orjson - except ImportError: - pass - - try: - import simplejson - JSON_IMPLS["simplejson"] = simplejson - except ImportError: - pass - - return JSON_IMPLS + import importlib + known_modnames = [ + 'ujson', 'json', 'nujson', 'orjson', 'simplejson' + ] + json_impls = {} + for libname in known_modnames: + try: + module = importlib.import_module(libname) + except ImportError: + pass + else: + json_impls[libname] = { + 'module': module, + 'version': module.__version__, + } + return json_impls def benchmark_json_dumps(): @@ -67,28 +145,34 @@ def benchmark_json_dumps(): sys.path.append(ub.expandpath('~/code/ultrajson/tests')) from benchmarker import Benchmarker - JSON_IMPLS = available_json_impls() + json_impls = available_json_impls() + data_lut = json_test_data_generators() - version_infos = {k: v.__version__ for k, v in JSON_IMPLS.items()} - - def method_lut(impl): - return JSON_IMPLS[impl].dumps + list(data_lut.keys()) # These are the parameters that we benchmark over basis = { "input": [ - "Array with UTF-8 strings", 
- "Array with doubles", + 'Array with doubles', + 'Array with UTF-8 strings', + # 'Medium complex object', + 'Array with True values', + 'Array of Dict[str, int]', + # 'Dict of List[Dict[str, int]]', + # 'Complex object' ], "size": [1, 32, 256, 1024, 2048], - "impl": list(JSON_IMPLS.keys()), + "impl": list(json_impls.keys()), } + # The Benchmarker class is a new experimental API around timerit to + # abstract away the details of timing a process over a grid of parameters, + # serializing the results, and aggregating results from disparate runs. benchmark = Benchmarker( name='bench_json_dumps', - # Change params here to modify number of trials num=100, bestof=10, + verbose=2, basis=basis, ) @@ -96,11 +180,11 @@ def benchmark_json_dumps(): for params in benchmark.iter_params(): # Make any modifications you need to compute input kwargs for each # method here. - impl = params["impl"] - impl_version = version_infos[impl] + impl_info = json_impls[params["impl"]] + method = impl_info['module'].dumps + impl_version = impl_info['version'] params["impl_version"] = impl_version - method = method_lut(impl) - data = data_lut(params["input"], params["size"]) + data = data_lut[params["input"]](params["size"]) # Timerit will run some user-specified number of loops. # and compute time stats with similar methodology to timeit for timer in benchmark.measure(): @@ -114,20 +198,25 @@ def benchmark_json_dumps(): benchmark.dump_in_dpath(dpath) RECORD_ALL = 0 - metric_key = "time" if RECORD_ALL else "mean" + metric_key = "time" if RECORD_ALL else "mean_time" from benchmarker import result_analysis results = benchmark.result.to_result_list() + analysis = result_analysis.ResultAnalysis( results, metrics=[metric_key], params=['impl'], metric_objectives={ - 'min': 'min', - 'mean': 'min', + 'min_time': 'min', + 'mean_time': 'min', 'time': 'min', }) analysis.analysis() + analysis.table + + param_group = ['impl', 'impl_version'] + analysis.abalate(param_group) # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) diff --git a/tests/benchmarker/_test_ttest.py b/tests/benchmarker/_test_ttest.py deleted file mode 100644 index 4e83a5d..0000000 --- a/tests/benchmarker/_test_ttest.py +++ /dev/null @@ -1,28 +0,0 @@ - -def check_ttest(): - import scipy - import scipy.stats # NOQA - from benchmarker.benchmarker import stats_dict - import numpy as np - metric_vals1 = np.random.randn(10000) + 0.01 - metric_vals2 = np.random.randn(1000) - - stats1 = stats_dict(metric_vals1) - stats2 = stats_dict(metric_vals2) - - ind_kw = dict( - equal_var=0, - # alternative='two-sided' - alternative='less' if stats1['mean'] < stats2['mean'] else 'greater' - ) - - # Not sure why these are slightly different - res1 = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) - - res2 = scipy.stats.ttest_ind_from_stats( - stats1['mean'], stats1['std'], stats1['n'], - stats2['mean'], stats2['std'], stats2['n'], - **ind_kw - ) - print('res1 = {!r}'.format(res1)) - print('res2 = {!r}'.format(res2)) diff --git a/tests/benchmarker/benchmarker.py b/tests/benchmarker/benchmarker.py index b488fb3..1050d6a 100644 --- a/tests/benchmarker/benchmarker.py +++ b/tests/benchmarker/benchmarker.py @@ -8,9 +8,9 @@ from benchmarker.process_context import ProcessContext @dataclass class BenchmarkerConfig: - name : str = None - num : int = 100 - bestof : int = 10 + name : str = None + num : int = 100 + bestof : int = 10 class BenchmarkerResult: @@ -97,14 +97,16 @@ class Benchmarker: >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir() >>> 
self.dump_in_dpath(dpath) """ - def __init__(self, basis={}, **kwargs): + def __init__(self, basis={}, verbose=1, **kwargs): self.basis = basis self.config = BenchmarkerConfig(**kwargs) self.ti = timerit.Timerit( num=self.config.num, - bestof=self.config.bestof) + bestof=self.config.bestof, + verbose=verbose, + ) self.context = ProcessContext(name=self.config.name) self.rows = [] self.RECORD_ALL = 0 @@ -152,7 +154,7 @@ class Benchmarker: rows.append(row) else: times = np.array(ti.robust_times()) - metrics = stats_dict(times) + metrics = stats_dict(times, '_time') row = { 'metrics': metrics, 'params': params, @@ -161,13 +163,13 @@ class Benchmarker: rows.append(row) -def stats_dict(data): +def stats_dict(data, suffix=''): stats = { - 'n': len(data), - 'mean': data.mean(), - 'std': data.std(), - 'min': data.min(), - 'max': data.max(), + 'nobs' + suffix: len(data), + 'mean' + suffix: data.mean(), + 'std' + suffix: data.std(), + 'min' + suffix: data.min(), + 'max' + suffix: data.max(), } return stats @@ -182,12 +184,12 @@ def combine_stats(s1, s2): Example: >>> basis = { - >>> 'n1': [1, 10, 100, 10000], - >>> 'n2': [1, 10, 100, 10000], + >>> 'nobs1': [1, 10, 100, 10000], + >>> 'nobs2': [1, 10, 100, 10000], >>> } >>> for params in ub.named_product(basis): - >>> data1 = np.random.rand(params['n1']) - >>> data2 = np.random.rand(params['n2']) + >>> data1 = np.random.rand(params['nobs1']) + >>> data2 = np.random.rand(params['nobs2']) >>> data3 = np.hstack([data1, data2]) >>> s1 = stats_dict(data1) >>> s2 = stats_dict(data2) @@ -203,7 +205,7 @@ def combine_stats(s1, s2): https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups """ stats = [s1, s2] - sizes = np.array([s['n'] for s in stats]) + sizes = np.array([s['nobs'] for s in stats]) means = np.array([s['mean'] for s in stats]) stds = np.array([s['std'] for s in stats]) mins = np.array([s['min'] for s in stats]) @@ -221,7 +223,7 @@ def combine_stats(s1, s2): combo_std = np.sqrt(combo_vars) combo_stats = { - 'n': combo_size, + 'nobs': combo_size, 'mean': combo_mean, 'std': combo_std, 'min': mins.min(), diff --git a/tests/benchmarker/result_analysis.py b/tests/benchmarker/result_analysis.py index 1067b3e..fd56edb 100644 --- a/tests/benchmarker/result_analysis.py +++ b/tests/benchmarker/result_analysis.py @@ -8,6 +8,19 @@ import scipy import scipy.stats # NOQA +# a list of common objectives +DEFAULT_METRIC_TO_OBJECTIVE = { + 'time': 'min', + 'ap': 'max', + 'acc': 'max', + 'f1': 'max', + 'mcc': 'max', + # + 'loss': 'min', + 'brier': 'min', +} + + class Result(ub.NiceRepr): """ Storage of names, parameters, and quality metrics for a single experiment. 
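A hedged sketch tying the renames in this patch together: stats_dict with a suffix produces metric names like 'mean_time' and 'min_time', which is why the updated benchmark3.py passes metric_objectives such as {'mean_time': 'min'} and why 'time' sits in the default objective table above. Assumes the benchmarker package is importable.

import numpy as np
from benchmarker.benchmarker import stats_dict

times = np.array([0.021, 0.019, 0.020, 0.018])
metrics = stats_dict(times, '_time')
print(sorted(metrics))  # ['max_time', 'mean_time', 'min_time', 'nobs_time', 'std_time']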
@@ -31,6 +44,10 @@ class Result(ub.NiceRepr): >>> self = Result.demo(rng=32) >>> print('self = {}'.format(self)) self = + + Example: + >>> self = Result.demo(mode='alt', rng=32) + >>> print('self = {}'.format(self)) """ def __init__(self, name, params, metrics, meta=None): self.name = name @@ -48,21 +65,43 @@ class Result(ub.NiceRepr): return text @classmethod - def demo(cls, rng=None): + def demo(cls, mode='null', rng=None): import numpy as np import string import kwarray rng = kwarray.ensure_rng(rng) - demo_param_space = { - 'param1': list(range(3)), - 'param2': np.linspace(0, 10, 10), - 'param3': list(string.ascii_lowercase[0:3]), - } - params = {k: rng.choice(b) for k, b in demo_param_space.items()} - metrics = { - 'f1': rng.rand(), - 'acc': rng.rand(), - } + + if mode == 'null': + # The null hypothesis should generally be true here, + # there is no relation between the results and parameters + demo_param_space = { + 'param1': list(range(3)), + 'param2': np.linspace(0, 10, 10), + 'param3': list(string.ascii_lowercase[0:3]), + } + params = {k: rng.choice(b) for k, b in demo_param_space.items()} + metrics = { + 'f1': rng.rand(), + 'acc': rng.rand(), + } + elif mode == 'alt': + # The alternative hypothesis should be true here, there is a + # relationship between results two of the params. + from scipy.special import expit + params = { + 'w': rng.randint(-1, 1), + 'x': rng.randint(-3, 3), + 'y': rng.randint(-2, 2), + 'z': rng.randint(-3, 3), + } + noise = np.random.randn() * 1 + r = 3 * params['x'] + params['y'] ** 2 + 0.3 * params['z'] ** 3 + acc = expit(r / 20 + noise) + metrics = { + 'acc': acc, + } + else: + raise KeyError(mode) name = ub.hash_data(params)[0:8] self = cls(name, params, metrics) return self @@ -105,6 +144,10 @@ class ResultAnalysis(ub.NiceRepr): >>> self = ResultAnalysis.demo() >>> self.analysis() + Example: + >>> self = ResultAnalysis.demo(num=5000, mode='alt') + >>> self.analysis() + Example: >>> # Given a list of experiments, configs, and results >>> # Create a ResultAnalysis object @@ -168,7 +211,8 @@ class ResultAnalysis(ub.NiceRepr): def __init__(self, results, metrics=None, params=None, ignore_params=None, ignore_metrics=None, metric_objectives=None, - abalation_orders={1}, default_objective='max'): + abalation_orders={1}, default_objective='max', + p_threshold=0.05): self.results = results if ignore_metrics is None: ignore_metrics = set() @@ -181,23 +225,15 @@ class ResultAnalysis(ub.NiceRepr): self.default_objective = default_objective # encode if we want to maximize or minimize a metric - default_metric_to_objective = { - 'ap': 'max', - 'acc': 'max', - 'f1': 'max', - # - 'loss': 'min', - 'brier': 'min', - } if metric_objectives is None: metric_objectives = {} - - self.metric_objectives = default_metric_to_objective.copy() + self.metric_objectives = DEFAULT_METRIC_TO_OBJECTIVE.copy() self.metric_objectives.update(metric_objectives) self.params = params self.metrics = metrics self.statistics = None + self.p_threshold = p_threshold self._description = {} self._description['built'] = False @@ -210,11 +246,14 @@ class ResultAnalysis(ub.NiceRepr): return ub.repr2(self._description, si=1, sv=1) @classmethod - def demo(cls, num=10, rng=None): + def demo(cls, num=10, mode='null', rng=None): import kwarray rng = kwarray.ensure_rng(rng) - results = [Result.demo(rng=rng) for _ in range(num)] - self = cls(results, metrics={'f1', 'acc'}) + results = [Result.demo(mode=mode, rng=rng) for _ in range(num)] + if mode == 'null': + self = cls(results, metrics={'f1', 'acc'}) + 
else: + self = cls(results, metrics={'acc'}) return self def run(self): @@ -251,18 +290,30 @@ class ResultAnalysis(ub.NiceRepr): varied = {k: vs for k, vs in varied.items() if len(vs)} return varied - def abalation_groups(self, param): + def abalation_groups(self, param_group, k=2): """ Return groups where the specified parameter(s) are varied, but all other non-ignored parameters are held the same. + Args: + param_group (str | List[str]): + One or more parameters that are allowed to vary + + k (int): + minimum number of items a group must contain to be returned + + Returns: + List[DataFrame]: + a list of subsets of in the table where all but the specified + (non-ignored) parameters are allowed to vary. + Example: >>> self = ResultAnalysis.demo() >>> param = 'param2' >>> self.abalation_groups(param) """ - if not ub.iterable(param): - param = [param] + if not ub.iterable(param_group): + param_group = [param_group] table = self.table config_rows = [r.params for r in self.results] config_keys = list(map(set, config_rows)) @@ -271,14 +322,14 @@ class ResultAnalysis(ub.NiceRepr): if self.ignore_params: config_keys = [c - self.ignore_params for c in config_keys] isect_params = set.intersection(*config_keys) - other_params = sorted(isect_params - set(param)) + other_params = sorted(isect_params - set(param_group)) groups = [] for key, group in table.groupby(other_params, dropna=False): - if len(group) > 1: + if len(group) >= k: groups.append(group) return groups - def abalate(self, param): + def abalate(self, param_group): """ Example: >>> self = ResultAnalysis.demo(100) @@ -287,34 +338,34 @@ class ResultAnalysis(ub.NiceRepr): >>> self.abalate(param) >>> self = ResultAnalysis.demo() - >>> param = ['param2', 'param3'] - >>> self.abalate(param) + >>> param_group = ['param2', 'param3'] + >>> # xdoctest: +REQUIRES(module:openskill) + >>> self.abalate(param_group) """ - import itertools as it if self.table is None: self.table = self.build_table() - if not ub.iterable(param): - param = [param] + if not ub.iterable(param_group): + param_group = [param_group] # For hashable generic dictionary from collections import namedtuple - gd = namedtuple('config', param) + gd = namedtuple('config', param_group) # from types import SimpleNamespace - param_unique_vals_ = self.table[param].drop_duplicates().to_dict('records') + param_unique_vals_ = self.table[param_group].drop_duplicates().to_dict('records') param_unique_vals = [gd(**d) for d in param_unique_vals_] - # param_unique_vals = {p: self.table[p].unique().tolist() for p in param} + # param_unique_vals = {p: self.table[p].unique().tolist() for p in param_group} score_improvements = ub.ddict(list) scored_obs = [] skillboard = SkillTracker(param_unique_vals) - groups = self.abalation_groups(param) + groups = self.abalation_groups(param_group, k=2) for group in groups: for metric_key in self.metrics: ascending = self._objective_is_ascending(metric_key) group = group.sort_values(metric_key, ascending=ascending) - subgroups = group.groupby(param) + subgroups = group.groupby(param_group) if ascending: best_idx = subgroups[metric_key].idxmax() else: @@ -326,19 +377,19 @@ class ResultAnalysis(ub.NiceRepr): if x1 != x2: r1 = best_group.loc[x1] r2 = best_group.loc[x2] - k1 = gd(**r1[param]) - k2 = gd(**r2[param]) + k1 = gd(**r1[param_group]) + k2 = gd(**r2[param_group]) diff = r1[metric_key] - r2[metric_key] score_improvements[(k1, k2, metric_key)].append(diff) # metric_vals = best_group[metric_key].values # diffs = metric_vals[None, :] - metric_vals[:, None] - 
best_group.set_index(param) - # best_group[param] + best_group.set_index(param_group) + # best_group[param_group] # best_group[metric_key].diff() - scored_ranking = best_group[param + [metric_key]].reset_index(drop=True) + scored_ranking = best_group[param_group + [metric_key]].reset_index(drop=True) scored_obs.append(scored_ranking) - ranking = [gd(**d) for d in scored_ranking[param].to_dict('records')] + ranking = [gd(**d) for d in scored_ranking[param_group].to_dict('records')] skillboard.observe(ranking) print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) @@ -377,15 +428,12 @@ class ResultAnalysis(ub.NiceRepr): # TODO : document these stats clearly and accurately Example: - >>> self = ResultAnalysis.demo(num=30) + >>> self = ResultAnalysis.demo(num=100) >>> print(self.table) - >>> param_group = ['param2'] + >>> param_group = ['param2', 'param1'] >>> metric_key = 'f1' >>> stats_row = self.test_group(param_group, metric_key) - >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2))) - >>> # --- - >>> self.build() - >>> self.report() + >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, sort=0, precision=2))) """ param_group_name = ','.join(param_group) stats_row = { @@ -461,10 +509,6 @@ class ResultAnalysis(ub.NiceRepr): pairwise_statistics = [] for pair in value_pairs: pair_statistics = {} - # try: - # param_val1, param_val2 = sorted(pair) - # except Exception: - # param_val1, param_val2 = (pair) param_val1, param_val2 = pair metric_vals1 = value_to_metric[param_val1] @@ -477,16 +521,17 @@ class ResultAnalysis(ub.NiceRepr): pair_statistics['value2'] = param_val2 pair_statistics['n1'] = len(metric_vals1) pair_statistics['n2'] = len(metric_vals2) - # TODO: probably want to use an alternative=less or greater here - # instead of simply unequal - alternative = 'two-sided' - if 1: + + TEST_ONLY_FOR_DIFFERENCE = True + if TEST_ONLY_FOR_DIFFERENCE: if ascending: # We want to minimize the metric alternative = 'less' if rank1 < rank2 else 'greater' else: # We want to maximize the metric alternative = 'greater' if rank1 < rank2 else 'less' + else: + alternative = 'two-sided' ind_kw = dict( equal_var=False, @@ -499,8 +544,8 @@ class ResultAnalysis(ub.NiceRepr): stats1 = stats_dict(metric_vals1) stats2 = stats_dict(metric_vals2) scipy.stats.ttest_ind_from_stats( - stats1['mean'], stats1['std'], stats1['n'], - stats2['mean'], stats2['std'], stats2['n'], + stats1['mean'], stats1['std'], stats1['nobs'], + stats2['mean'], stats2['std'], stats2['nobs'], **ind_kw ) # metric_vals1, metric_vals2, equal_var=False) @@ -523,6 +568,8 @@ class ResultAnalysis(ub.NiceRepr): for nk in common: group1 = nk_to_group1[nk] group2 = nk_to_group2[nk] + # TODO: Not sure if taking the product of everything within + # the comparable group is correct or not. I think it is ok. 
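+                # For intuition (illustrative values): it.product enumerates
+                # every candidate pairing, e.g.
+                #   >>> import itertools as it
+                #   >>> list(it.product([0, 1], [5, 6]))
+                #   [(0, 5), (0, 6), (1, 5), (1, 6)]
+                # ttest_rel then treats each (i, j) pair as one paired
+                # observation, which is only meaningful if every such pair is
+                # a repeated measurement of the same configuration.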
for i, j in it.product(group1.index, group2.index): comparable_indexes1.append(i) comparable_indexes2.append(j) @@ -590,7 +637,6 @@ class ResultAnalysis(ub.NiceRepr): self._description['built'] = True def report(self): - p_threshold = 0.05 stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) stat_groups_items = list(stat_groups.items()) @@ -600,43 +646,47 @@ class ResultAnalysis(ub.NiceRepr): 'metrics': self.metrics_of_interest, }) for grid_item in grid: - metric_key = grid_item['metrics'] - stat_groups_item = grid_item['stat_group_item'] - - param_name, stat_group = stat_groups_item - stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] - title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) - print('\n\n') - print(title) - print('=' * len(title)) - print(stats_row['moments']) - anova_rank_p = stats_row['anova_rank_p'] - anova_mean_p = stats_row['anova_mean_p'] - # Rougly speaking - print('') - print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) - print('') - print('Pairwise T-Tests') - for pairstat in stats_row['pairwise']: - # Is this backwards? - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] - if value2 == winner: - value1, value2 = value2, value1 - print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') - if 'ttest_ind' in pairstat: - ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) - if 'ttest_rel' in pairstat: - n_common = pairstat['n_common'] - ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + self._report_one(grid_item) print(self.stats_table) + def _report_one(self, grid_item): + p_threshold = self.p_threshold + metric_key = grid_item['metrics'] + stat_groups_item = grid_item['stat_group_item'] + + param_name, stat_group = stat_groups_item + stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] + title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) + print('\n\n') + print(title) + print('=' * len(title)) + print(stats_row['moments']) + anova_rank_p = stats_row['anova_rank_p'] + anova_mean_p = stats_row['anova_mean_p'] + # Rougly speaking + print('') + print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print('') + print('Pairwise T-Tests') + for pairstat in stats_row['pairwise']: + # Is this backwards? 
+ value1 = pairstat['value1'] + value2 = pairstat['value2'] + winner = pairstat['winner'] + if value2 == winner: + value1, value2 = value2, value1 + print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') + if 'ttest_ind' in pairstat: + ttest_ind_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + if 'ttest_rel' in pairstat: + n_common = pairstat['n_common'] + ttest_rel_result = pairstat['ttest_ind'] + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + def conclusions(self): conclusions = [] for stat in self.statistics: @@ -653,6 +703,50 @@ class ResultAnalysis(ub.NiceRepr): conclusions.append(txt) return conclusions + def plot(self, xlabel, metric_key, group_labels): + """ + Example: + >>> self = ResultAnalysis.demo(num=5000, mode='alt') + >>> self.analysis() + >>> print('self = {}'.format(self)) + >>> # xdoctest: +REQUIRES(module:kwplot) + >>> import kwplot + >>> kwplot.autompl() + >>> xlabel = 'x' + >>> metric_key = 'acc' + >>> group_labels = { + >>> 'col': ['y', 'w'], + >>> 'hue': ['z'], + >>> 'size': [], + >>> } + >>> self.plot(xlabel, metric_key, group_labels) + """ + import seaborn as sns + sns.set() + from matplotlib import pyplot as plt # NOQA + data = self.table + data = data.sort_values(metric_key) + for gname, labels in group_labels.items(): + if len(labels): + new_col = [] + for row in data[labels].to_dict('records'): + item = ub.repr2(row, compact=1, si=1) + new_col.append(item) + gkey = gname + "_key" + data[gkey] = new_col + + plotkw = {} + for gname, labels in group_labels.items(): + if labels: + plotkw[gname] = gname + "_key" + + # Your variables may change + # ax = plt.figure().gca() + col = plotkw.pop("col") + facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) + facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) + facet.add_legend() + class SkillTracker: """ @@ -677,6 +771,9 @@ class SkillTracker: 4: 0.20, 5: 0.20, } + + Requirements: + openskill """ def __init__(self, player_ids): From daf8913cc248ce5aae6e0292489230d5613adb7b Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 10:09:33 -0400 Subject: [PATCH 04/25] log scale --- tests/benchmark3.py | 19 +++++++++++++++++-- tests/benchmarker/result_analysis.py | 15 ++++++++++----- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/tests/benchmark3.py b/tests/benchmark3.py index b6e084e..2563d4d 100644 --- a/tests/benchmark3.py +++ b/tests/benchmark3.py @@ -217,13 +217,28 @@ def benchmark_json_dumps(): param_group = ['impl', 'impl_version'] analysis.abalate(param_group) - # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "col": ["input"], + "hue": ["impl"], + "size": [], + } + import kwplot + kwplot.autompl() + facet = analysis.plot(xlabel, metric_key, group_labels) + for ax in facet.axes.ravel(): + ax.set_xscale('log') + ax.set_yscale('log') + print('facet = {!r}'.format(facet)) + kwplot.show_if_requested() + if __name__ == "__main__": """ CommandLine: - python ~/code/ultrajson/tests/benchmark3.py + python ~/code/ultrajson/tests/benchmark3.py --show """ benchmark_json_dumps() diff --git a/tests/benchmarker/result_analysis.py b/tests/benchmarker/result_analysis.py index fd56edb..d6f474e 100644 --- 
a/tests/benchmarker/result_analysis.py +++ b/tests/benchmarker/result_analysis.py @@ -240,9 +240,6 @@ class ResultAnalysis(ub.NiceRepr): self._description['num_results'] = len(self.results) def __nice__(self): - # if len(self._description) == 0: - # return 'unbuilt' - # else: return ub.repr2(self._description, si=1, sv=1) @classmethod @@ -405,8 +402,13 @@ class ResultAnalysis(ub.NiceRepr): def _objective_is_ascending(self, metric_key): """ - Return True if we should minimize the objective (lower is better) - Return False if we should maximize the objective (higher is better) + Args: + metric_key (str): the metric in question + + Returns: + bool: + True if we should minimize the objective (lower is better) + False if we should maximize the objective (higher is better) """ objective = self.metric_objectives.get(metric_key, None) if objective is None: @@ -578,6 +580,8 @@ class ResultAnalysis(ub.NiceRepr): comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key] # Does this need to have the values aligned? + # I think that is the case giving my understanding of paired + # t-tests, but the docs need a PR to make that more clear. ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) pair_statistics['n_common'] = len(common) pair_statistics['ttest_rel'] = ttest_rel_result @@ -746,6 +750,7 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) facet.add_legend() + return facet class SkillTracker: From 68c4a55284318969a17ada11e6935935f007fe8f Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 16:03:23 -0400 Subject: [PATCH 05/25] Reorganize as separate module --- json_benchmarks/__init__.py | 0 json_benchmarks/__main__.py | 8 + json_benchmarks/benchmarker/__init__.py | 39 +++ .../benchmarker/aggregate.py | 6 +- .../benchmarker/benchmarker.py | 28 +- .../benchmarker/process_context.py | 17 +- .../benchmarker/result_analysis.py | 176 +++++++++---- .../benchmarker/util_json.py | 0 .../benchmarker/visualize.py | 0 json_benchmarks/core.py | 248 ++++++++++++++++++ json_benchmarks/datagen.py | 115 ++++++++ tests/benchmark3.py | 244 ----------------- tests/benchmarker/__init__.py | 35 --- 13 files changed, 568 insertions(+), 348 deletions(-) create mode 100644 json_benchmarks/__init__.py create mode 100644 json_benchmarks/__main__.py create mode 100644 json_benchmarks/benchmarker/__init__.py rename {tests => json_benchmarks}/benchmarker/aggregate.py (89%) rename {tests => json_benchmarks}/benchmarker/benchmarker.py (91%) rename {tests => json_benchmarks}/benchmarker/process_context.py (87%) rename {tests => json_benchmarks}/benchmarker/result_analysis.py (87%) rename {tests => json_benchmarks}/benchmarker/util_json.py (100%) rename {tests => json_benchmarks}/benchmarker/visualize.py (100%) create mode 100644 json_benchmarks/core.py create mode 100644 json_benchmarks/datagen.py delete mode 100644 tests/benchmark3.py delete mode 100644 tests/benchmarker/__init__.py diff --git a/json_benchmarks/__init__.py b/json_benchmarks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/json_benchmarks/__main__.py b/json_benchmarks/__main__.py new file mode 100644 index 0000000..faf03f6 --- /dev/null +++ b/json_benchmarks/__main__.py @@ -0,0 +1,8 @@ + +if __name__ == '__main__': + """ + CommandLine: + python -m json_benchmarks + """ + from json_benchmarks import core + core.main() diff --git 
a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py new file mode 100644 index 0000000..8abb4c5 --- /dev/null +++ b/json_benchmarks/benchmarker/__init__.py @@ -0,0 +1,39 @@ +""" +A helper module for executing, serializing, combining, and comparing benchmarks +""" + +__mkinit__ = """ +# Autogenerate this file +mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w +""" + +__version__ = '0.1.0' + +from json_benchmarks.benchmarker import aggregate +from json_benchmarks.benchmarker import benchmarker +from json_benchmarks.benchmarker import process_context +from json_benchmarks.benchmarker import result_analysis +from json_benchmarks.benchmarker import util_json +from json_benchmarks.benchmarker import visualize + +from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) +from json_benchmarks.benchmarker.benchmarker import (Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, + combine_stats, + stats_dict,) +from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker.result_analysis import ( + DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) +from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', + 'ResultAnalysis', 'SkillTracker', 'aggregate', 'benchmark_analysis', + 'benchmarker', 'combine_stats', 'demo', 'demo_data', + 'ensure_json_serializable', 'find_json_unserializable', + 'indexable_allclose', 'process_context', 'result_analysis', + 'stats_dict', 'util_json', 'visualize'] diff --git a/tests/benchmarker/aggregate.py b/json_benchmarks/benchmarker/aggregate.py similarity index 89% rename from tests/benchmarker/aggregate.py rename to json_benchmarks/benchmarker/aggregate.py index 41d11e8..b2d74c9 100644 --- a/tests/benchmarker/aggregate.py +++ b/json_benchmarks/benchmarker/aggregate.py @@ -4,7 +4,7 @@ import ubelt as ub def demo_data(): - from benchmarker.benchmarker import Benchmarker + from json_benchmarks.benchmarker.benchmarker import Benchmarker import numpy as np impl_lut = { 'numpy': np.sum, @@ -43,8 +43,8 @@ def demo_data(): def demo(): - from benchmarker import BenchmarkerResult - from benchmarker import result_analysis + from json_benchmarks.benchmarker import BenchmarkerResult + from json_benchmarks.benchmarker import result_analysis fpaths = demo_data() results = [] diff --git a/tests/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py similarity index 91% rename from tests/benchmarker/benchmarker.py rename to json_benchmarks/benchmarker/benchmarker.py index 1050d6a..6ff05d5 100644 --- a/tests/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -3,7 +3,7 @@ import timerit import ubelt as ub import numpy as np from dataclasses import dataclass -from benchmarker.process_context import ProcessContext +from json_benchmarks.benchmarker.process_context import ProcessContext @dataclass @@ -49,7 +49,7 @@ class BenchmarkerResult: Returns: List[Result] """ - from benchmarker import result_analysis + from json_benchmarks.benchmarker import result_analysis results = [] for row in self.rows: result = result_analysis.Result( @@ -69,9 +69,6 @@ class Benchmarker: Helper to organize the execution and serialization of a benchmark Example: - >>> import 
sys, ubelt - >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) - >>> from benchmarker.benchmarker import * # NOQA >>> import numpy as np >>> impl_lut = { >>> 'numpy': np.sum, @@ -205,11 +202,22 @@ def combine_stats(s1, s2): https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups """ stats = [s1, s2] - sizes = np.array([s['nobs'] for s in stats]) - means = np.array([s['mean'] for s in stats]) - stds = np.array([s['std'] for s in stats]) - mins = np.array([s['min'] for s in stats]) - maxs = np.array([s['max'] for s in stats]) + data = { + 'nobs': np.array([s['nobs'] for s in stats]), + 'mean': np.array([s['mean'] for s in stats]), + 'std': np.array([s['std'] for s in stats]), + 'min': np.array([s['min'] for s in stats]), + 'max': np.array([s['max'] for s in stats]), + } + combine_stats_arrs(data) + + +def combine_stats_arrs(data): + sizes = data['nobs'] + means = data['mean'] + stds = data['std'] + mins = data['min'] + maxs = data['max'] varis = stds * stds combo_size = sizes.sum() diff --git a/tests/benchmarker/process_context.py b/json_benchmarks/benchmarker/process_context.py similarity index 87% rename from tests/benchmarker/process_context.py rename to json_benchmarks/benchmarker/process_context.py index e198f9c..bce02c0 100644 --- a/tests/benchmarker/process_context.py +++ b/json_benchmarks/benchmarker/process_context.py @@ -9,11 +9,10 @@ class ProcessContext: Context manager to track the context under which a result was computed Example: - >>> import sys, ubelt - >>> sys.path.append(ubelt.expandpath('~/code/ultrajson/tests')) - >>> from benchmarker.process_context import * # NOQA + >>> from json_benchmarks.benchmarker.process_context import * # NOQA >>> self = ProcessContext() >>> obj = self.start().stop() + >>> print('obj = {}'.format(ub.repr2(obj, nl=2))) """ def __init__(self, name=None, args=None, config=None): @@ -69,15 +68,19 @@ class ProcessContext: 'mem_total': svmem_info.total, } - # def _cpuinfo(self): - # import cpuinfo - # cpu_info = cpuinfo.get_cpu_info() - # return cpu_info + def _cpuinfo(self): + import cpuinfo + _cpu_info = cpuinfo.get_cpu_info() + cpu_info = { + 'cpu_brand': _cpu_info['brand_raw'], + } + return cpu_info def _machine(self): return ub.dict_union( self._hostinfo(), self._meminfo(), + self._cpuinfo(), self._osinfo(), self._pyinfo(), ) diff --git a/tests/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py similarity index 87% rename from tests/benchmarker/result_analysis.py rename to json_benchmarks/benchmarker/result_analysis.py index d6f474e..e07d027 100644 --- a/tests/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -89,10 +89,11 @@ class Result(ub.NiceRepr): # relationship between results two of the params. 
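                # For intuition (worked numbers, ignoring the noise term):
                # x=3, y=2, z=3 gives r = 9 + 4 + 8.1 = 21.1 and
                # expit(21.1 / 20) is roughly 0.74, while x=-2, y=0, z=0 gives
                # r = -6 and expit(-0.3) is roughly 0.43, so 'x', 'y' and 'z'
                # clearly move 'acc'; 'u' and 'v' never enter r, so they
                # presumably act as null controls.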
from scipy.special import expit params = { - 'w': rng.randint(-1, 1), - 'x': rng.randint(-3, 3), - 'y': rng.randint(-2, 2), - 'z': rng.randint(-3, 3), + 'u': rng.randint(0, 1 + 1), + 'v': rng.randint(-1, 1 + 1), + 'x': rng.randint(-2, 3 + 1), + 'y': rng.randint(-1, 2 + 1), + 'z': rng.randint(-0, 3 + 1), } noise = np.random.randn() * 1 r = 3 * params['x'] + params['y'] ** 2 + 0.3 * params['z'] ** 3 @@ -326,8 +327,28 @@ class ResultAnalysis(ub.NiceRepr): groups.append(group) return groups + def _objective_is_ascending(self, metric_key): + """ + Args: + metric_key (str): the metric in question + + Returns: + bool: + True if we should minimize the objective (lower is better) + False if we should maximize the objective (higher is better) + """ + objective = self.metric_objectives.get(metric_key, None) + if objective is None: + warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + objective = self.default_objective + ascending = (objective == 'min') + return ascending + def abalate(self, param_group): """ + TODO: + rectify with test-group + Example: >>> self = ResultAnalysis.demo(100) >>> param = 'param2' @@ -400,23 +421,6 @@ class ResultAnalysis(ub.NiceRepr): print(pd.DataFrame([pd.Series(pos_delta).describe().T])) return scored_obs - def _objective_is_ascending(self, metric_key): - """ - Args: - metric_key (str): the metric in question - - Returns: - bool: - True if we should minimize the objective (lower is better) - False if we should maximize the objective (higher is better) - """ - objective = self.metric_objectives.get(metric_key, None) - if objective is None: - warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') - objective = self.default_objective - ascending = (objective == 'min') - return ascending - def test_group(self, param_group, metric_key): """ Get stats for a particular metric / constant group @@ -477,8 +481,10 @@ class ResultAnalysis(ub.NiceRepr): # Determine a set of value pairs to do pairwise comparisons on value_pairs = ub.oset() - value_pairs.update(map(frozenset, ub.iter_window(moments.index, 2))) - value_pairs.update(map(frozenset, ub.iter_window(moments.sort_values('mean', ascending=ascending).index, 2))) + # value_pairs.update( + # map(frozenset, ub.iter_window(moments.index, 2))) + value_pairs.update(map(frozenset, ub.iter_window( + moments.sort_values('mean', ascending=ascending).index, 2))) # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance # If the researcher can make the assumptions of an identically @@ -508,9 +514,9 @@ class ResultAnalysis(ub.NiceRepr): stats_row['anova_mean_p'] = anova_1way_result.pvalue stats_row['moments'] = moments - pairwise_statistics = [] + pair_stats_list = [] for pair in value_pairs: - pair_statistics = {} + pair_stats = {} param_val1, param_val2 = pair metric_vals1 = value_to_metric[param_val1] @@ -518,11 +524,11 @@ class ResultAnalysis(ub.NiceRepr): rank1 = param_to_rank[param_val1] rank2 = param_to_rank[param_val2] - pair_statistics['winner'] = param_val1 if rank1 < rank2 else param_val2 - pair_statistics['value1'] = param_val1 - pair_statistics['value2'] = param_val2 - pair_statistics['n1'] = len(metric_vals1) - pair_statistics['n2'] = len(metric_vals2) + pair_stats['winner'] = param_val1 if rank1 < rank2 else param_val2 + pair_stats['value1'] = param_val1 + pair_stats['value2'] = param_val2 + pair_stats['n1'] = len(metric_vals1) + pair_stats['n2'] = len(metric_vals2) TEST_ONLY_FOR_DIFFERENCE = True if TEST_ONLY_FOR_DIFFERENCE: @@ -554,7 +560,7 @@ class 
ResultAnalysis(ub.NiceRepr): scipy.stats.ttest_ind_from_stats - pair_statistics['ttest_ind'] = ttest_ind_result + pair_stats['ttest_ind'] = ttest_ind_result # Do relative checks, need to find comparable subgroups metric_group1 = value_to_metric_group[param_val1] @@ -583,11 +589,11 @@ class ResultAnalysis(ub.NiceRepr): # I think that is the case giving my understanding of paired # t-tests, but the docs need a PR to make that more clear. ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) - pair_statistics['n_common'] = len(common) - pair_statistics['ttest_rel'] = ttest_rel_result - pairwise_statistics.append(pair_statistics) + pair_stats['n_common'] = len(common) + pair_stats['ttest_rel'] = ttest_rel_result + pair_stats_list.append(pair_stats) - stats_row['pairwise'] = pairwise_statistics + stats_row['pairwise'] = pair_stats_list return stats_row def build(self): @@ -671,8 +677,10 @@ class ResultAnalysis(ub.NiceRepr): # Rougly speaking print('') print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None)) + print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', + 'green' if anova_rank_p < p_threshold else None)) + print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', + 'green' if anova_mean_p < p_threshold else None)) print('') print('Pairwise T-Tests') for pairstat in stats_row['pairwise']: @@ -685,11 +693,13 @@ class ResultAnalysis(ub.NiceRepr): print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') if 'ttest_ind' in pairstat: ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None)) + print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', + 'green' if ttest_ind_result.pvalue < p_threshold else None)) if 'ttest_rel' in pairstat: n_common = pairstat['n_common'] ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None)) + print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', + 'green' if ttest_rel_result.pvalue < p_threshold else None)) def conclusions(self): conclusions = [] @@ -709,17 +719,31 @@ class ResultAnalysis(ub.NiceRepr): def plot(self, xlabel, metric_key, group_labels): """ + Args: + group_labels (dict): + Tells seaborn what attributes to use to distinsuish curves like + hue, size, marker. Also can contain "col" for use with + FacetGrid, and "fig" to separate different configurations into + different figures. + + Returns: + List[Dict]: + A list for each figure containing info abou that figure for any + postprocessing. 
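+                (In the current sketch each entry holds a 'fig' and a 'facet'
+                key; treat these keys as implementation details rather than a
+                stable contract.)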
+ Example: - >>> self = ResultAnalysis.demo(num=5000, mode='alt') + >>> self = ResultAnalysis.demo(num=1000, mode='alt') >>> self.analysis() >>> print('self = {}'.format(self)) + >>> print('self.varied = {}'.format(ub.repr2(self.varied, nl=1))) >>> # xdoctest: +REQUIRES(module:kwplot) >>> import kwplot - >>> kwplot.autompl() + >>> kwplot.autosns() >>> xlabel = 'x' >>> metric_key = 'acc' >>> group_labels = { - >>> 'col': ['y', 'w'], + >>> 'fig': ['u'], + >>> 'col': ['y', 'v'], >>> 'hue': ['z'], >>> 'size': [], >>> } @@ -739,18 +763,72 @@ class ResultAnalysis(ub.NiceRepr): gkey = gname + "_key" data[gkey] = new_col - plotkw = {} + plot_kws = { + 'x': xlabel, + 'y': metric_key, + } for gname, labels in group_labels.items(): if labels: - plotkw[gname] = gname + "_key" + plot_kws[gname] = gname + "_key" # Your variables may change # ax = plt.figure().gca() - col = plotkw.pop("col") - facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False) - facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw) - facet.add_legend() - return facet + fig_params = plot_kws.pop("fig", []) + + facet_kws = { + 'sharex': False, + 'sharey': False, + } + # facet_kws['col'] = plot_kws.pop("col", None) + # facet_kws['row'] = plot_kws.pop("row", None) + # if not facet_kws['row']: + # facet_kws['col_wrap'] = 5 + plot_kws['row'] = plot_kws.get("row", None) + # if not plot_kws['row']: + # plot_kws['col_wrap'] = 5 + + if not fig_params: + groups = [('', data)] + else: + groups = data.groupby(fig_params) + + if 'marker' not in plot_kws: + plot_kws['marker'] = "o" + + plot_kws['ci'] = "sd" + + # Use a consistent pallete across plots + unique_hues = data['hue_key'].unique() + palette = ub.dzip(unique_hues, sns.color_palette(n_colors=len(unique_hues))) + plot_kws['palette'] = palette + + plots = [] + base_fnum = 1 + for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): + # TODO: seaborn doesn't give us any option to reuse an existing + # figure or even specify what it's handle should be. A patch should + # be submitted to add that feature, but in the meantime work around + # it and use the figures they give us. 
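+            # A hypothetical workaround (sketch only, not used here): drop
+            # down to the axes-level API, which does accept an existing Axes,
+            # e.g.
+            #   fig = plt.figure(fnum); ax = fig.gca()
+            #   sns.lineplot(data=group, x=xlabel, y=metric_key, ax=ax)
+            # at the cost of losing relplot's automatic faceting.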
+ + # fig = plt.figure(fnum) + # fig.clf() + + facet = sns.relplot( + data=group, kind='line', + facet_kws=facet_kws, + **plot_kws) + + fig = facet.figure + fig.suptitle(fig_key) + fig.tight_layout() + # facet = sns.FacetGrid(group, **facet_kws) + # facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, **plot_kws) + # facet.add_legend() + plots.append({ + 'fig': fig, + 'facet': facet, + }) + return plots class SkillTracker: diff --git a/tests/benchmarker/util_json.py b/json_benchmarks/benchmarker/util_json.py similarity index 100% rename from tests/benchmarker/util_json.py rename to json_benchmarks/benchmarker/util_json.py diff --git a/tests/benchmarker/visualize.py b/json_benchmarks/benchmarker/visualize.py similarity index 100% rename from tests/benchmarker/visualize.py rename to json_benchmarks/benchmarker/visualize.py diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py new file mode 100644 index 0000000..404a9f8 --- /dev/null +++ b/json_benchmarks/core.py @@ -0,0 +1,248 @@ +""" +Main definition of the benchmarks +""" +import json +import ubelt as ub +import scriptconfig as scfg +from json_benchmarks import benchmarker +from json_benchmarks import datagen + +KNOWN_LIBRARIES = [ + "ujson", + "nujson", + "orjson", + "simplejson", + "json", +] + + +class JSONBenchmarkConfig(scfg.Config): + """ + Benchmark JSON implementations + """ + default = { + 'disable': scfg.Value([], choices=KNOWN_LIBRARIES, help=ub.paragraph( + ''' + Remove specified libraries from the benchmarks + ''' + )), + + 'factor': scfg.Value(1.0, help=ub.paragraph( + ''' + Specify as a fraction to speed up benchmarks for development / + testing + ''')), + + 'cache_dir': scfg.Value(None, help=ub.paragraph( + ''' + Location for benchmark cache. + Defaults to $XDG_CACHE/ujson/benchmark_results/ + ''')), + } + + def normalize(self): + dpath = self['cache_dir'] + if dpath is None: + dpath = ub.Path.appdir('ujson/benchmark_results') + dpath = ub.Path(dpath) + self['cache_dir'] = dpath + + +def available_json_impls(): + import importlib + known_modnames = [ + 'ujson', 'json', 'nujson', 'orjson', 'simplejson' + ] + json_impls = {} + for libname in known_modnames: + try: + module = importlib.import_module(libname) + except ImportError: + pass + else: + json_impls[libname] = { + 'module': module, + 'version': module.__version__, + } + return json_impls + + +def benchmark_json(): + json_impls = available_json_impls() + + data_lut = datagen.json_test_data_generators() + + # These are the parameters that we benchmark over + basis = { + "input": [ + 'Array with doubles', + 'Array with UTF-8 strings', + # 'Medium complex object', + 'Array with True values', + 'Array of Dict[str, int]', + # 'Dict of List[Dict[str, int]]', + # 'Complex object' + ], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], + "impl": list(json_impls.keys()), + "func": ['dumps', 'loads'], + } + + # The Benchmarker class is a new experimental API around timerit to + # abstract away the details of timing a process over a grid of parameters, + # serializing the results, and aggregating results from disparate runs. + benchmark = benchmarker.Benchmarker( + name='bench_json', + num=100, + bestof=10, + verbose=2, + basis=basis, + ) + + # For each variation of your experiment, create a row. + for params in benchmark.iter_params(): + # Make any modifications you need to compute input kwargs for each + # method here. 
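+        # Each `params` dict is a single grid point drawn from `basis`, e.g.
+        # (illustrative values): {'input': 'Array with doubles', 'size': 256,
+        # 'impl': 'ujson', 'func': 'dumps'}, plus the keys added below.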
+ impl_info = json_impls[params["impl"]] + params["impl_version"] = impl_info['version'] + module = impl_info['module'] + if params['func'] == 'dumps': + method = module.dumps + data = data_lut[params["input"]](params["size"]) + elif params['func'] == 'loads': + method = module.loads + to_encode = data_lut[params["input"]](params["size"]) + data = json.dumps(to_encode) + # Timerit will run some user-specified number of loops. + # and compute time stats with similar methodology to timeit + for timer in benchmark.measure(): + # Put any setup logic you dont want to time here. + # ... + with timer: + # Put the logic you want to time here + method(data) + + dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() + result_fpath = benchmark.dump_in_dpath(dpath) + return result_fpath + + +def aggregate_results(result_fpaths): + import json + results = [] + for fpath in result_fpaths: + data = json.loads(fpath.read_text()) + for row in data['rows']: + result = benchmarker.BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "mean_time" + + # results = benchmark.result.to_result_list() + + analysis = benchmarker.result_analysis.ResultAnalysis( + results, + metrics=[metric_key], + params=['impl'], + metric_objectives={ + 'min_time': 'min', + 'mean_time': 'min', + 'time': 'min', + }) + analysis.analysis() + + table = analysis.table + + def aggregate_time_stats(data, group_keys=None): + """ + Given columns interpreted as containing stats, aggregate those stats + within each group. For each row, any non-group, non-stat column + with consistent values across that columns in the group is kept as-is, + otherwise the new column for that row is set to None. + """ + import pandas as pd + # Stats groupings + stats_cols = [ + 'nobs_time', + 'std_time', + 'mean_time', + 'max_time', + 'min_time', + ] + mapper = {c: c.replace('_time', '') for c in stats_cols} + unmapper = ub.invert_dict(mapper) + non_stats_cols = list(ub.oset(data.columns) - stats_cols) + if group_keys is None: + group_keys = non_stats_cols + non_group_keys = list(ub.oset(non_stats_cols) - group_keys) + from json_benchmarks.benchmarker.benchmarker import combine_stats_arrs + new_rows = [] + for group_vals, group in list(data.groupby(group_keys)): + # hack, is this a pandas bug in 1.4.1? 
Is it fixed + if isinstance(group_keys, list) and not isinstance(group_vals, list): + group_vals = [group_vals] + stat_data = group[stats_cols].rename(mapper, axis=1) + new_stats = combine_stats_arrs(stat_data) + new_time_stats = ub.map_keys(unmapper, new_stats) + new_row = ub.dzip(group_keys, group_vals) + if non_group_keys: + for k in non_group_keys: + unique_vals = group[k].unique() + if len(unique_vals) == 1: + new_row[k] = unique_vals[0] + else: + new_row[k] = None + new_row.update(new_time_stats) + new_rows.append(new_row) + new_data = pd.DataFrame(new_rows) + return new_data + + single_size = table[table['size'] == 256] + # single_size_combo = aggregate_time_stats(single_size, None) + single_size_combo = aggregate_time_stats(single_size, ['name']) + + param_group = ['impl', 'impl_version'] + single_size_combo['calls/sec'] = 1 / single_size_combo['mean_time'] + _single_size_combo = single_size_combo.copy() + _single_size_combo['calls/sec'] = _single_size_combo['calls/sec'].apply(lambda x: '{:,.02f}'.format(x)) + piv = _single_size_combo.pivot('input', param_group, 'calls/sec') + print(piv) + + analysis.abalate(param_group) + # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "fig": ["input"], + "hue": ["impl", "impl_version"], + "size": [], + } + import kwplot + kwplot.autosns() + plots = analysis.plot(xlabel, metric_key, group_labels) + for plot in plots: + for ax in plot['facet'].axes.ravel(): + ax.set_xscale('log') + ax.set_yscale('log') + kwplot.show_if_requested() + + +def main(): + from json_benchmarks import core + config = core.JSONBenchmarkConfig(cmdline=True) + dpath = config['cache_dir'] + + run = 1 + if run: + result_fpath = core.benchmark_json() + print('result_fpath = {!r}'.format(result_fpath)) + result_fpaths = [result_fpath] + + agg = 1 + if agg: + result_fpaths = list(dpath.glob('benchmarks*.json')) + + core.aggregate_results(result_fpaths) + # results_output_table(libraries) diff --git a/json_benchmarks/datagen.py b/json_benchmarks/datagen.py new file mode 100644 index 0000000..ff27d6c --- /dev/null +++ b/json_benchmarks/datagen.py @@ -0,0 +1,115 @@ +import random +import sys + + +def json_test_data_generators(): + """ + Generates data for benchmarks with various sizes + + Returns: + Dict[str, callable]: + a mapping from test data name to its generator + + Example: + >>> data_lut = json_test_data_generators() + >>> size = 2 + >>> keys = sorted(set(data_lut) - {'Complex object'}) + >>> for key in keys: + >>> func = data_lut[key] + >>> test_object = func(size) + >>> print('key = {!r}'.format(key)) + >>> print('test_object = {!r}'.format(test_object)) + """ + data_lut = {} + def _register_data(name): + def _wrap(func): + data_lut[name] = func + return _wrap + + # seed if desired + # rng = random.Random(0) + rng = random + + @_register_data('Array with doubles') + def array_with_doubles(size): + test_object = [sys.maxsize * rng.random() for _ in range(size)] + return test_object + + @_register_data('Array with UTF-8 strings') + def array_with_utf8_strings(size): + utf8_string = ( + "نظام الحكم سلطاني وراثي " + "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" + " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " + ) + test_object = [utf8_string for _ in range(size)] + return test_object + + @_register_data('Medium complex object') + def medium_complex_object(size): + user = { + "userId": 3381293, + "age": 213, + 
"username": "johndoe", + "fullname": "John Doe the Second", + "isAuthorized": True, + "liked": 31231.31231202, + "approval": 31.1471, + "jobs": [1, 2], + "currJob": None, + } + friends = [user, user, user, user, user, user, user, user] + test_object = [[user, friends] for _ in range(size)] + return test_object + + @_register_data('Array with True values') + def true_values(size): + test_object = [True for _ in range(size)] + return test_object + + @_register_data('Array of Dict[str, int]') + def array_of_dict_string_int(size): + test_object = [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(size) + ] + return test_object + + @_register_data('Dict of List[Dict[str, int]]') + def dict_of_list_dict_str_int(size): + keys = set() + while len(keys) < size: + key = str(rng.random() * 20) + keys.add(key) + test_object = { + key: [ + {str(rng.random() * 20): int(rng.random() * 1000000)} + for _ in range(256) + ] + for key in keys + } + return test_object + + @_register_data('Complex object') + def complex_object(size): + import json + # TODO: might be better to reigster this file with setup.py or + # download it via some mechanism + try: + dpath = ub.Path(__file__).parent + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + except Exception: + import ujson + dpath = ub.Path(ujson.__file__).parent / 'tests' + fpath = dpath / 'sample.json' + if not fpath.exists(): + raise Exception + with open(fpath, 'r') as f: + test_object = json.load(f) + if size > 1: + test_object = [test_object] * size + return test_object + + return data_lut diff --git a/tests/benchmark3.py b/tests/benchmark3.py deleted file mode 100644 index 2563d4d..0000000 --- a/tests/benchmark3.py +++ /dev/null @@ -1,244 +0,0 @@ -""" -Roadmap: - - - [ ] -""" -import random -import sys -import ubelt as ub - - -def json_test_data_generators(): - """ - Generates data for benchmarks with various sizes - - Returns: - Dict[str, callable]: - a mapping from test data name to its generator - - Example: - >>> data_lut = json_test_data_generators() - >>> size = 2 - >>> keys = sorted(set(data_lut) - {'Complex object'}) - >>> for key in keys: - >>> func = data_lut[key] - >>> test_object = func(size) - >>> print('key = {!r}'.format(key)) - >>> print('test_object = {!r}'.format(test_object)) - """ - data_lut = {} - def _register_data(name): - def _wrap(func): - data_lut[name] = func - return _wrap - - # seed if desired - #rng = random.Random() - rng = random - - @_register_data('Array with doubles') - def array_with_doubles(size): - test_object = [sys.maxsize * rng.random() for _ in range(size)] - return test_object - - @_register_data('Array with UTF-8 strings') - def array_with_utf8_strings(size): - utf8_string = ( - "نظام الحكم سلطاني وراثي " - "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية" - " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين " - ) - test_object = [utf8_string for _ in range(size)] - return test_object - - @_register_data('Medium complex object') - def medium_complex_object(size): - user = { - "userId": 3381293, - "age": 213, - "username": "johndoe", - "fullname": "John Doe the Second", - "isAuthorized": True, - "liked": 31231.31231202, - "approval": 31.1471, - "jobs": [1, 2], - "currJob": None, - } - friends = [user, user, user, user, user, user, user, user] - test_object = [[user, friends] for _ in range(size)] - return test_object - - @_register_data('Array with True values') - def true_values(size): - test_object = [True for _ in 
range(size)] - return test_object - - @_register_data('Array of Dict[str, int]') - def array_of_dict_string_int(size): - test_object = [ - {str(rng.random() * 20): int(rng.random() * 1000000)} - for _ in range(size) - ] - return test_object - - @_register_data('Dict of List[Dict[str, int]]') - def dict_of_list_dict_str_int(size): - keys = set() - while len(keys) < size: - key = str(rng.random() * 20) - keys.add(key) - test_object = { - key: [ - {str(rng.random() * 20): int(rng.random() * 1000000)} - for _ in range(256) - ] - for key in keys - } - return test_object - - @_register_data('Complex object') - def complex_object(size): - import json - # TODO: might be better to reigster this file with setup.py or - # download it via some mechanism - try: - dpath = ub.Path(__file__).parent - fpath = dpath / 'sample.json' - if not fpath.exists(): - raise Exception - except Exception: - import ujson - dpath = ub.Path(ujson.__file__).parent / 'tests' - fpath = dpath / 'sample.json' - if not fpath.exists(): - raise Exception - with open(fpath, 'r') as f: - test_object = json.load(f) - if size > 1: - test_object = [test_object] * size - return test_object - - return data_lut - - -def available_json_impls(): - import importlib - known_modnames = [ - 'ujson', 'json', 'nujson', 'orjson', 'simplejson' - ] - json_impls = {} - for libname in known_modnames: - try: - module = importlib.import_module(libname) - except ImportError: - pass - else: - json_impls[libname] = { - 'module': module, - 'version': module.__version__, - } - return json_impls - - -def benchmark_json_dumps(): - # TODO: remove this hack - sys.path.append(ub.expandpath('~/code/ultrajson/tests')) - from benchmarker import Benchmarker - - json_impls = available_json_impls() - data_lut = json_test_data_generators() - - list(data_lut.keys()) - - # These are the parameters that we benchmark over - basis = { - "input": [ - 'Array with doubles', - 'Array with UTF-8 strings', - # 'Medium complex object', - 'Array with True values', - 'Array of Dict[str, int]', - # 'Dict of List[Dict[str, int]]', - # 'Complex object' - ], - "size": [1, 32, 256, 1024, 2048], - "impl": list(json_impls.keys()), - } - - # The Benchmarker class is a new experimental API around timerit to - # abstract away the details of timing a process over a grid of parameters, - # serializing the results, and aggregating results from disparate runs. - benchmark = Benchmarker( - name='bench_json_dumps', - num=100, - bestof=10, - verbose=2, - basis=basis, - ) - - # For each variation of your experiment, create a row. - for params in benchmark.iter_params(): - # Make any modifications you need to compute input kwargs for each - # method here. - impl_info = json_impls[params["impl"]] - method = impl_info['module'].dumps - impl_version = impl_info['version'] - params["impl_version"] = impl_version - data = data_lut[params["input"]](params["size"]) - # Timerit will run some user-specified number of loops. - # and compute time stats with similar methodology to timeit - for timer in benchmark.measure(): - # Put any setup logic you dont want to time here. - # ... 
- with timer: - # Put the logic you want to time here - method(data) - - dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() - benchmark.dump_in_dpath(dpath) - - RECORD_ALL = 0 - metric_key = "time" if RECORD_ALL else "mean_time" - - from benchmarker import result_analysis - results = benchmark.result.to_result_list() - - analysis = result_analysis.ResultAnalysis( - results, - metrics=[metric_key], - params=['impl'], - metric_objectives={ - 'min_time': 'min', - 'mean_time': 'min', - 'time': 'min', - }) - analysis.analysis() - analysis.table - - param_group = ['impl', 'impl_version'] - analysis.abalate(param_group) - # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) - - xlabel = "size" - # Set these to empty lists if they are not used - group_labels = { - "col": ["input"], - "hue": ["impl"], - "size": [], - } - import kwplot - kwplot.autompl() - facet = analysis.plot(xlabel, metric_key, group_labels) - for ax in facet.axes.ravel(): - ax.set_xscale('log') - ax.set_yscale('log') - print('facet = {!r}'.format(facet)) - kwplot.show_if_requested() - - -if __name__ == "__main__": - """ - CommandLine: - python ~/code/ultrajson/tests/benchmark3.py --show - """ - benchmark_json_dumps() diff --git a/tests/benchmarker/__init__.py b/tests/benchmarker/__init__.py deleted file mode 100644 index 1d04095..0000000 --- a/tests/benchmarker/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -A helper module for executing, serializing, combining, and comparing benchmarks -""" - -__mkinit__ = """ -# Autogenerate this file -mkinit ~/code/ultrajson/tests/benchmarker/__init__.py -w -""" - -__version__ = '0.1.0' - -from benchmarker import aggregate -from benchmarker import benchmarker -from benchmarker import process_context -from benchmarker import result_analysis -from benchmarker import util_json -from benchmarker import visualize - -from benchmarker.aggregate import (demo, demo_data,) -from benchmarker.benchmarker import (Benchmarker, BenchmarkerConfig, - BenchmarkerResult, combine_stats, - stats_dict,) -from benchmarker.process_context import (ProcessContext,) -from benchmarker.result_analysis import (Result, ResultAnalysis, SkillTracker,) -from benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from benchmarker.visualize import (benchmark_analysis,) - -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'ProcessContext', 'Result', 'ResultAnalysis', 'SkillTracker', - 'aggregate', 'benchmark_analysis', 'benchmarker', 'combine_stats', - 'demo', 'demo_data', 'ensure_json_serializable', - 'find_json_unserializable', 'indexable_allclose', 'process_context', - 'result_analysis', 'stats_dict', 'util_json', 'visualize'] From a89bc27ff518ccf5f5c878dc649fa9d9533fec1e Mon Sep 17 00:00:00 2001 From: joncrall Date: Thu, 26 May 2022 16:27:26 -0400 Subject: [PATCH 06/25] add support for loads and complex object bench --- json_benchmarks/benchmarker/benchmarker.py | 9 ++++- .../benchmarker/result_analysis.py | 4 +-- json_benchmarks/core.py | 36 +++++++++++++++---- json_benchmarks/datagen.py | 3 +- 4 files changed, 41 insertions(+), 11 deletions(-) diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 6ff05d5..c59a28f 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -121,7 +121,14 @@ class Benchmarker: def iter_params(self): self.context.start() - grid_iter = list(ub.named_product(self.basis)) + if 
isinstance(self.basis, dict): + grid_iter = ub.named_product(self.basis) + else: + grid_iter = ub.flatten([ + ub.named_product(b) + for b in self.basis + ]) + for params in grid_iter: self.params = params self.key = ub.repr2(params, compact=1, si=1) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index e07d027..5decbea 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -776,8 +776,8 @@ class ResultAnalysis(ub.NiceRepr): fig_params = plot_kws.pop("fig", []) facet_kws = { - 'sharex': False, - 'sharey': False, + 'sharex': True, + 'sharey': True, } # facet_kws['col'] = plot_kws.pop("col", None) # facet_kws['row'] = plot_kws.pop("row", None) diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 404a9f8..1bf6e65 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -73,7 +73,11 @@ def benchmark_json(): data_lut = datagen.json_test_data_generators() # These are the parameters that we benchmark over - basis = { + common_basis = { + "impl": list(json_impls.keys()), + "func": ['dumps', 'loads'], + } + sized_basis = { "input": [ 'Array with doubles', 'Array with UTF-8 strings', @@ -83,10 +87,20 @@ def benchmark_json(): # 'Dict of List[Dict[str, int]]', # 'Complex object' ], - "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], - "impl": list(json_impls.keys()), - "func": ['dumps', 'loads'], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512], + # 1024, 2048, 4096, 8192, 12288], } + predefined_basis = { + "input": [ + 'Complex object' + ], + 'size': [None], + } + + basis = [ + ub.dict_union(common_basis, predefined_basis), + ub.dict_union(common_basis, sized_basis), + ] # The Benchmarker class is a new experimental API around timerit to # abstract away the details of timing a process over a grid of parameters, @@ -95,12 +109,18 @@ def benchmark_json(): name='bench_json', num=100, bestof=10, - verbose=2, + verbose=3, basis=basis, ) + def is_blocked(params): + if params['input'] == 'Complex object' and params['impl'] == 'orjson': + return True + # For each variation of your experiment, create a row. for params in benchmark.iter_params(): + if is_blocked(params): + continue # Make any modifications you need to compute input kwargs for each # method here. 
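        # Note: for func='loads' the payload is pre-serialized with the stdlib
        # json module below, so every implementation parses identical input
        # text for a given grid point.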
impl_info = json_impls[params["impl"]] @@ -198,7 +218,7 @@ def aggregate_results(result_fpaths): new_data = pd.DataFrame(new_rows) return new_data - single_size = table[table['size'] == 256] + single_size = table[(table['size'] == 256) | table['size'].isnull()] # single_size_combo = aggregate_time_stats(single_size, None) single_size_combo = aggregate_time_stats(single_size, ['name']) @@ -206,7 +226,8 @@ def aggregate_results(result_fpaths): single_size_combo['calls/sec'] = 1 / single_size_combo['mean_time'] _single_size_combo = single_size_combo.copy() _single_size_combo['calls/sec'] = _single_size_combo['calls/sec'].apply(lambda x: '{:,.02f}'.format(x)) - piv = _single_size_combo.pivot('input', param_group, 'calls/sec') + piv = _single_size_combo.pivot(['input', 'func'], param_group, 'calls/sec') + print('Table for size=256') print(piv) analysis.abalate(param_group) @@ -216,6 +237,7 @@ def aggregate_results(result_fpaths): # Set these to empty lists if they are not used group_labels = { "fig": ["input"], + "col": ["func"], "hue": ["impl", "impl_version"], "size": [], } diff --git a/json_benchmarks/datagen.py b/json_benchmarks/datagen.py index ff27d6c..afb2708 100644 --- a/json_benchmarks/datagen.py +++ b/json_benchmarks/datagen.py @@ -1,5 +1,6 @@ import random import sys +import ubelt as ub def json_test_data_generators(): @@ -108,7 +109,7 @@ def json_test_data_generators(): raise Exception with open(fpath, 'r') as f: test_object = json.load(f) - if size > 1: + if size is not None: test_object = [test_object] * size return test_object From 4d0f705d6d24f71a9bc0af0812ad7d6d51faa5c5 Mon Sep 17 00:00:00 2001 From: joncrall Date: Fri, 27 May 2022 10:15:48 -0400 Subject: [PATCH 07/25] wip --- json_benchmarks/benchmarker/result_analysis.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 5decbea..2b270a5 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -795,7 +795,10 @@ class ResultAnalysis(ub.NiceRepr): if 'marker' not in plot_kws: plot_kws['marker'] = "o" + # We will want to overwrite this with our own std estimate plot_kws['ci'] = "sd" + # err_style='band', + # err_kws=None, # Use a consistent pallete across plots unique_hues = data['hue_key'].unique() @@ -814,7 +817,9 @@ class ResultAnalysis(ub.NiceRepr): # fig.clf() facet = sns.relplot( - data=group, kind='line', + data=group, + # kind='line', + kind='scatter', facet_kws=facet_kws, **plot_kws) From 3159c0088990f3369bb99e062df15986952fd9dd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 27 May 2022 14:19:02 +0000 Subject: [PATCH 08/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/__main__.py | 4 +- json_benchmarks/benchmarker/__init__.py | 80 ++-- json_benchmarks/benchmarker/aggregate.py | 52 +-- json_benchmarks/benchmarker/benchmarker.py | 107 ++--- .../benchmarker/process_context.py | 77 ++-- .../benchmarker/result_analysis.py | 379 +++++++++++------- json_benchmarks/benchmarker/util_json.py | 41 +- json_benchmarks/benchmarker/visualize.py | 8 +- json_benchmarks/core.py | 147 ++++--- json_benchmarks/datagen.py | 30 +- 10 files changed, 544 insertions(+), 381 deletions(-) diff --git a/json_benchmarks/__main__.py b/json_benchmarks/__main__.py index faf03f6..c4ddc30 100644 --- a/json_benchmarks/__main__.py +++ 
b/json_benchmarks/__main__.py @@ -1,8 +1,8 @@ - -if __name__ == '__main__': +if __name__ == "__main__": """ CommandLine: python -m json_benchmarks """ from json_benchmarks import core + core.main() diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index 8abb4c5..ee32ea0 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -7,33 +7,59 @@ __mkinit__ = """ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w """ -__version__ = '0.1.0' +__version__ = "0.1.0" -from json_benchmarks.benchmarker import aggregate -from json_benchmarks.benchmarker import benchmarker -from json_benchmarks.benchmarker import process_context -from json_benchmarks.benchmarker import result_analysis -from json_benchmarks.benchmarker import util_json -from json_benchmarks.benchmarker import visualize - -from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) -from json_benchmarks.benchmarker.benchmarker import (Benchmarker, - BenchmarkerConfig, - BenchmarkerResult, - combine_stats, - stats_dict,) -from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker import ( + aggregate, + benchmarker, + process_context, + result_analysis, + util_json, + visualize, +) +from json_benchmarks.benchmarker.aggregate import demo, demo_data +from json_benchmarks.benchmarker.benchmarker import ( + Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, + combine_stats, + stats_dict, +) +from json_benchmarks.benchmarker.process_context import ProcessContext from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) -from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + DEFAULT_METRIC_TO_OBJECTIVE, + Result, + ResultAnalysis, + SkillTracker, +) +from json_benchmarks.benchmarker.util_json import ( + ensure_json_serializable, + find_json_unserializable, + indexable_allclose, +) +from json_benchmarks.benchmarker.visualize import benchmark_analysis -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', - 'ResultAnalysis', 'SkillTracker', 'aggregate', 'benchmark_analysis', - 'benchmarker', 'combine_stats', 'demo', 'demo_data', - 'ensure_json_serializable', 'find_json_unserializable', - 'indexable_allclose', 'process_context', 'result_analysis', - 'stats_dict', 'util_json', 'visualize'] +__all__ = [ + "Benchmarker", + "BenchmarkerConfig", + "BenchmarkerResult", + "DEFAULT_METRIC_TO_OBJECTIVE", + "ProcessContext", + "Result", + "ResultAnalysis", + "SkillTracker", + "aggregate", + "benchmark_analysis", + "benchmarker", + "combine_stats", + "demo", + "demo_data", + "ensure_json_serializable", + "find_json_unserializable", + "indexable_allclose", + "process_context", + "result_analysis", + "stats_dict", + "util_json", + "visualize", +] diff --git a/json_benchmarks/benchmarker/aggregate.py b/json_benchmarks/benchmarker/aggregate.py index b2d74c9..bba5771 100644 --- a/json_benchmarks/benchmarker/aggregate.py +++ b/json_benchmarks/benchmarker/aggregate.py @@ -1,31 +1,36 @@ import json + import pandas as pd import ubelt as ub def demo_data(): - from json_benchmarks.benchmarker.benchmarker import Benchmarker import numpy as np + + from json_benchmarks.benchmarker.benchmarker import Benchmarker + impl_lut 
= { - 'numpy': np.sum, - 'builtin': sum, - } - def data_lut(params): - item = 42 if params['dtype'] == 'int' else 42.0 - data = [item] * params['size'] - return data - basis = { - 'impl': ['builtin', 'numpy'], - 'size': [10, 10000], - 'dtype': ['int', 'float'], + "numpy": np.sum, + "builtin": sum, } - dpath = ub.Path.appdir('benchmarker/agg_demo').delete().ensuredir() + def data_lut(params): + item = 42 if params["dtype"] == "int" else 42.0 + data = [item] * params["size"] + return data + + basis = { + "impl": ["builtin", "numpy"], + "size": [10, 10000], + "dtype": ["int", "float"], + } + + dpath = ub.Path.appdir("benchmarker/agg_demo").delete().ensuredir() def run_one_benchmark(): - self = Benchmarker(name='agg_demo', num=10, bestof=3, basis=basis) + self = Benchmarker(name="agg_demo", num=10, bestof=3, basis=basis) for params in self.iter_params(): - impl = impl_lut[params['impl']] + impl = impl_lut[params["impl"]] data = data_lut(params) for timer in self.measure(): with timer: @@ -43,25 +48,26 @@ def demo_data(): def demo(): - from json_benchmarks.benchmarker import BenchmarkerResult - from json_benchmarks.benchmarker import result_analysis + from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis + fpaths = demo_data() results = [] for fpath in fpaths: data = json.loads(fpath.read_text()) - for row in data['rows']: + for row in data["rows"]: result = BenchmarkerResult.load(fpath) results.extend(result.to_result_list()) analysis = result_analysis.ResultAnalysis( results, - metrics=['min', 'mean'], - params=['impl'], + metrics=["min", "mean"], + params=["impl"], metric_objectives={ - 'min': 'min', - 'mean': 'min', - }) + "min": "min", + "mean": "min", + }, + ) analysis.analysis() # single_df = pd.DataFrame(data['rows']) # context = data['context'] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index c59a28f..008ba82 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -1,43 +1,46 @@ import json +from dataclasses import dataclass + +import numpy as np import timerit import ubelt as ub -import numpy as np -from dataclasses import dataclass + from json_benchmarks.benchmarker.process_context import ProcessContext @dataclass class BenchmarkerConfig: - name : str = None - num : int = 100 - bestof : int = 10 + name: str = None + num: int = 100 + bestof: int = 10 class BenchmarkerResult: """ Serialization for a single benchmark result """ + def __init__(self, context, rows): self.context = context self.rows = rows def __json__(self): data = { - 'type': 'benchmark_result', - 'context': self.context, - 'rows': self.rows, + "type": "benchmark_result", + "context": self.context, + "rows": self.rows, } return data @classmethod def from_json(cls, data): - assert data['type'] == 'benchmark_result' - self = cls(data['context'], data['rows']) + assert data["type"] == "benchmark_result" + self = cls(data["context"], data["rows"]) return self @classmethod def load(cls, fpath): - with open(fpath, 'r') as file: + with open(fpath) as file: data = json.load(file) self = cls.from_json(data) return self @@ -50,14 +53,15 @@ class BenchmarkerResult: List[Result] """ from json_benchmarks.benchmarker import result_analysis + results = [] for row in self.rows: result = result_analysis.Result( - name=row['name'], - metrics=row['metrics'], - params=row['params'].copy(), + name=row["name"], + metrics=row["metrics"], + params=row["params"].copy(), ) - machine = self.context['machine'] + machine 
= self.context["machine"] assert not ub.dict_isect(result.params, machine) result.params.update(machine) results.append(result) @@ -94,6 +98,7 @@ class Benchmarker: >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir() >>> self.dump_in_dpath(dpath) """ + def __init__(self, basis={}, verbose=1, **kwargs): self.basis = basis @@ -111,11 +116,11 @@ class Benchmarker: def dump_in_dpath(self, dpath): dpath = ub.Path(dpath) - timestamp = self.context.obj['stop_timestamp'] - fname = f'benchmarks_{self.config.name}_{timestamp}.json' + timestamp = self.context.obj["stop_timestamp"] + fname = f"benchmarks_{self.config.name}_{timestamp}.json" fpath = dpath / fname - with open(fpath, 'w') as file: + with open(fpath, "w") as file: json.dump(self.result.__json__(), file) return fpath @@ -124,10 +129,7 @@ class Benchmarker: if isinstance(self.basis, dict): grid_iter = ub.named_product(self.basis) else: - grid_iter = ub.flatten([ - ub.named_product(b) - for b in self.basis - ]) + grid_iter = ub.flatten([ub.named_product(b) for b in self.basis]) for params in grid_iter: self.params = params @@ -137,8 +139,7 @@ class Benchmarker: self.result = BenchmarkerResult(obj, self.rows) def measure(self): - for timer in self.ti.reset(self.key): - yield timer + yield from self.ti.reset(self.key) rows = self.rows ti = self.ti @@ -151,29 +152,29 @@ class Benchmarker: "time": time, } row = { - 'name': key, - 'metrics': metrics, - 'params': params, + "name": key, + "metrics": metrics, + "params": params, } rows.append(row) else: times = np.array(ti.robust_times()) - metrics = stats_dict(times, '_time') + metrics = stats_dict(times, "_time") row = { - 'metrics': metrics, - 'params': params, - 'name': key, + "metrics": metrics, + "params": params, + "name": key, } rows.append(row) -def stats_dict(data, suffix=''): +def stats_dict(data, suffix=""): stats = { - 'nobs' + suffix: len(data), - 'mean' + suffix: data.mean(), - 'std' + suffix: data.std(), - 'min' + suffix: data.min(), - 'max' + suffix: data.max(), + "nobs" + suffix: len(data), + "mean" + suffix: data.mean(), + "std" + suffix: data.std(), + "min" + suffix: data.min(), + "max" + suffix: data.max(), } return stats @@ -210,27 +211,27 @@ def combine_stats(s1, s2): """ stats = [s1, s2] data = { - 'nobs': np.array([s['nobs'] for s in stats]), - 'mean': np.array([s['mean'] for s in stats]), - 'std': np.array([s['std'] for s in stats]), - 'min': np.array([s['min'] for s in stats]), - 'max': np.array([s['max'] for s in stats]), + "nobs": np.array([s["nobs"] for s in stats]), + "mean": np.array([s["mean"] for s in stats]), + "std": np.array([s["std"] for s in stats]), + "min": np.array([s["min"] for s in stats]), + "max": np.array([s["max"] for s in stats]), } combine_stats_arrs(data) def combine_stats_arrs(data): - sizes = data['nobs'] - means = data['mean'] - stds = data['std'] - mins = data['min'] - maxs = data['max'] + sizes = data["nobs"] + means = data["mean"] + stds = data["std"] + mins = data["min"] + maxs = data["max"] varis = stds * stds combo_size = sizes.sum() combo_mean = (sizes * means).sum() / combo_size - mean_deltas = (means - combo_mean) + mean_deltas = means - combo_mean sv = (sizes * varis).sum() sm = (sizes * (mean_deltas * mean_deltas)).sum() @@ -238,10 +239,10 @@ def combine_stats_arrs(data): combo_std = np.sqrt(combo_vars) combo_stats = { - 'nobs': combo_size, - 'mean': combo_mean, - 'std': combo_std, - 'min': mins.min(), - 'max': maxs.max(), + "nobs": combo_size, + "mean": combo_mean, + "std": combo_std, + "min": mins.min(), + "max": maxs.max(), } 
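# Note added for clarity (a sketch, not part of the patch): the combination
# above follows the standard grouped-data identity. With group sizes n_i,
# means m_i, and variances v_i = std_i**2:
#   combined_mean = sum(n_i * m_i) / N,   where N = sum(n_i)
#   combined_var  = (sum(n_i * v_i) + sum(n_i * (m_i - combined_mean)**2)) / N
# i.e. the pooled within-group variance plus the spread of the group means.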
return combo_stats diff --git a/json_benchmarks/benchmarker/process_context.py b/json_benchmarks/benchmarker/process_context.py index bce02c0..a66062a 100644 --- a/json_benchmarks/benchmarker/process_context.py +++ b/json_benchmarks/benchmarker/process_context.py @@ -1,8 +1,9 @@ -import ubelt as ub -import socket import platform +import socket import sys +import ubelt as ub + class ProcessContext: """ @@ -20,59 +21,71 @@ class ProcessContext: args = sys.argv self.obj = { - 'type': 'process_context', - 'name': name, - 'args': args, - 'config': config, - 'machine': None, - 'start_timestamp': None, - 'stop_timestamp': None, + "type": "process_context", + "name": name, + "args": args, + "config": config, + "machine": None, + "start_timestamp": None, + "stop_timestamp": None, } def _timestamp(self): import datetime - timestamp = datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() - timestamp = timestamp.replace(':', '') + + timestamp = ( + datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat() + ) + timestamp = timestamp.replace(":", "") # timestamp = ub.timestamp() return timestamp def _hostinfo(self): return { - 'host': socket.gethostname(), - 'user': ub.Path(ub.userhome()).name, + "host": socket.gethostname(), + "user": ub.Path(ub.userhome()).name, # 'cwd': os.getcwd(), } def _osinfo(self): - uname_system, _, uname_release, uname_version, _, uname_processor = platform.uname() + ( + uname_system, + _, + uname_release, + uname_version, + _, + uname_processor, + ) = platform.uname() return { - 'os_name': uname_system, - 'os_release': uname_release, - 'os_version': uname_version, - 'arch': uname_processor, + "os_name": uname_system, + "os_release": uname_release, + "os_version": uname_version, + "arch": uname_processor, } def _pyinfo(self): return { - 'py_impl': platform.python_implementation(), - 'py_version': sys.version.replace("\n", ""), + "py_impl": platform.python_implementation(), + "py_version": sys.version.replace("\n", ""), } def _meminfo(self): import psutil + # TODO: could collect memory info at start and stop and intermediate # stages. Here we just want info that is static wrt to the machine. # For now, just get the total available. 
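# Illustrative sketch (not part of the patch): `psutil.virtual_memory()`
# returns a namedtuple whose `total` field is the physical memory in bytes.
import psutil
total_gib = psutil.virtual_memory().total / (1024 ** 3)  # e.g. RAM in GiB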
svmem_info = psutil.virtual_memory() return { - 'mem_total': svmem_info.total, + "mem_total": svmem_info.total, } def _cpuinfo(self): import cpuinfo + _cpu_info = cpuinfo.get_cpu_info() cpu_info = { - 'cpu_brand': _cpu_info['brand_raw'], + "cpu_brand": _cpu_info["brand_raw"], } return cpu_info @@ -86,17 +99,21 @@ class ProcessContext: ) def start(self): - self.obj.update({ - 'machine': self._machine(), - 'start_timestamp': self._timestamp(), - 'stop_timestamp': None, - }) + self.obj.update( + { + "machine": self._machine(), + "start_timestamp": self._timestamp(), + "stop_timestamp": None, + } + ) return self def stop(self): - self.obj.update({ - 'stop_timestamp': self._timestamp(), - }) + self.obj.update( + { + "stop_timestamp": self._timestamp(), + } + ) return self.obj def __enter__(self): diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 2b270a5..944d85e 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -1,23 +1,23 @@ import itertools as it import math +import warnings + import numpy as np import pandas as pd -import ubelt as ub -import warnings import scipy import scipy.stats # NOQA - +import ubelt as ub # a list of common objectives DEFAULT_METRIC_TO_OBJECTIVE = { - 'time': 'min', - 'ap': 'max', - 'acc': 'max', - 'f1': 'max', - 'mcc': 'max', + "time": "min", + "ap": "max", + "acc": "max", + "f1": "max", + "mcc": "max", # - 'loss': 'min', - 'brier': 'min', + "loss": "min", + "brier": "min", } @@ -49,6 +49,7 @@ class Result(ub.NiceRepr): >>> self = Result.demo(mode='alt', rng=32) >>> print('self = {}'.format(self)) """ + def __init__(self, name, params, metrics, meta=None): self.name = name self.params = params @@ -56,7 +57,7 @@ class Result(ub.NiceRepr): self.meta = meta def to_dict(self): - row = ub.dict_union({'name': self.name}, self.metrics, self.params) + row = ub.dict_union({"name": self.name}, self.metrics, self.params) return row def __nice__(self): @@ -65,41 +66,44 @@ class Result(ub.NiceRepr): return text @classmethod - def demo(cls, mode='null', rng=None): - import numpy as np + def demo(cls, mode="null", rng=None): import string + import kwarray + import numpy as np + rng = kwarray.ensure_rng(rng) - if mode == 'null': + if mode == "null": # The null hypothesis should generally be true here, # there is no relation between the results and parameters demo_param_space = { - 'param1': list(range(3)), - 'param2': np.linspace(0, 10, 10), - 'param3': list(string.ascii_lowercase[0:3]), + "param1": list(range(3)), + "param2": np.linspace(0, 10, 10), + "param3": list(string.ascii_lowercase[0:3]), } params = {k: rng.choice(b) for k, b in demo_param_space.items()} metrics = { - 'f1': rng.rand(), - 'acc': rng.rand(), + "f1": rng.rand(), + "acc": rng.rand(), } - elif mode == 'alt': + elif mode == "alt": # The alternative hypothesis should be true here, there is a # relationship between results two of the params. 
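# Note added for clarity (not part of the patch): `expit` below is the
# logistic sigmoid, expit(x) = 1 / (1 + exp(-x)); it squashes the score r
# into (0, 1) so `acc` behaves like an accuracy, e.g. expit(0) == 0.5.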
from scipy.special import expit + params = { - 'u': rng.randint(0, 1 + 1), - 'v': rng.randint(-1, 1 + 1), - 'x': rng.randint(-2, 3 + 1), - 'y': rng.randint(-1, 2 + 1), - 'z': rng.randint(-0, 3 + 1), + "u": rng.randint(0, 1 + 1), + "v": rng.randint(-1, 1 + 1), + "x": rng.randint(-2, 3 + 1), + "y": rng.randint(-1, 2 + 1), + "z": rng.randint(-0, 3 + 1), } noise = np.random.randn() * 1 - r = 3 * params['x'] + params['y'] ** 2 + 0.3 * params['z'] ** 3 + r = 3 * params["x"] + params["y"] ** 2 + 0.3 * params["z"] ** 3 acc = expit(r / 20 + noise) metrics = { - 'acc': acc, + "acc": acc, } else: raise KeyError(mode) @@ -210,10 +214,18 @@ class ResultAnalysis(ub.NiceRepr): ttest_ind: p=0.7626 """ - def __init__(self, results, metrics=None, params=None, ignore_params=None, - ignore_metrics=None, metric_objectives=None, - abalation_orders={1}, default_objective='max', - p_threshold=0.05): + def __init__( + self, + results, + metrics=None, + params=None, + ignore_params=None, + ignore_metrics=None, + metric_objectives=None, + abalation_orders={1}, + default_objective="max", + p_threshold=0.05, + ): self.results = results if ignore_metrics is None: ignore_metrics = set() @@ -237,21 +249,22 @@ class ResultAnalysis(ub.NiceRepr): self.p_threshold = p_threshold self._description = {} - self._description['built'] = False - self._description['num_results'] = len(self.results) + self._description["built"] = False + self._description["num_results"] = len(self.results) def __nice__(self): return ub.repr2(self._description, si=1, sv=1) @classmethod - def demo(cls, num=10, mode='null', rng=None): + def demo(cls, num=10, mode="null", rng=None): import kwarray + rng = kwarray.ensure_rng(rng) results = [Result.demo(mode=mode, rng=rng) for _ in range(num)] - if mode == 'null': - self = cls(results, metrics={'f1', 'acc'}) + if mode == "null": + self = cls(results, metrics={"f1", "acc"}) else: - self = cls(results, metrics={'acc'}) + self = cls(results, metrics={"acc"}) return self def run(self): @@ -284,7 +297,8 @@ class ResultAnalysis(ub.NiceRepr): # remove nans varied = { k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))} - for k, vs in varied.items()} + for k, vs in varied.items() + } varied = {k: vs for k, vs in varied.items() if len(vs)} return varied @@ -339,9 +353,9 @@ class ResultAnalysis(ub.NiceRepr): """ objective = self.metric_objectives.get(metric_key, None) if objective is None: - warnings.warn(f'warning assume {self.default_objective} for {metric_key=}') + warnings.warn(f"warning assume {self.default_objective} for {metric_key=}") objective = self.default_objective - ascending = (objective == 'min') + ascending = objective == "min" return ascending def abalate(self, param_group): @@ -367,10 +381,13 @@ class ResultAnalysis(ub.NiceRepr): # For hashable generic dictionary from collections import namedtuple - gd = namedtuple('config', param_group) + + gd = namedtuple("config", param_group) # from types import SimpleNamespace - param_unique_vals_ = self.table[param_group].drop_duplicates().to_dict('records') + param_unique_vals_ = ( + self.table[param_group].drop_duplicates().to_dict("records") + ) param_unique_vals = [gd(**d) for d in param_unique_vals_] # param_unique_vals = {p: self.table[p].unique().tolist() for p in param_group} score_improvements = ub.ddict(list) @@ -405,19 +422,29 @@ class ResultAnalysis(ub.NiceRepr): best_group.set_index(param_group) # best_group[param_group] # best_group[metric_key].diff() - scored_ranking = best_group[param_group + [metric_key]].reset_index(drop=True) 
+ scored_ranking = best_group[param_group + [metric_key]].reset_index( + drop=True + ) scored_obs.append(scored_ranking) - ranking = [gd(**d) for d in scored_ranking[param_group].to_dict('records')] + ranking = [ + gd(**d) for d in scored_ranking[param_group].to_dict("records") + ] skillboard.observe(ranking) - print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':'))) + print( + "skillboard.ratings = {}".format( + ub.repr2(skillboard.ratings, nl=1, align=":") + ) + ) win_probs = skillboard.predict_win() - print('win_probs = {}'.format(ub.repr2(win_probs, nl=1))) + print(f"win_probs = {ub.repr2(win_probs, nl=1)}") for key, improves in score_improvements.items(): k1, k2, metric_key = key improves = np.array(improves) pos_delta = improves[improves > 0] - print(f'\nWhen {k1} is better than {k2}, the improvement in {metric_key} is') + print( + f"\nWhen {k1} is better than {k2}, the improvement in {metric_key} is" + ) print(pd.DataFrame([pd.Series(pos_delta).describe().T])) return scored_obs @@ -441,10 +468,10 @@ class ResultAnalysis(ub.NiceRepr): >>> stats_row = self.test_group(param_group, metric_key) >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, sort=0, precision=2))) """ - param_group_name = ','.join(param_group) + param_group_name = ",".join(param_group) stats_row = { - 'param_name': param_group_name, - 'metric': metric_key, + "param_name": param_group_name, + "metric": metric_key, } # param_values = varied[param_name] # stats_row['param_values'] = param_values @@ -463,7 +490,7 @@ class ResultAnalysis(ub.NiceRepr): nuisance_cols = sorted(set(self.varied.keys()) - set(param_group)) for param_value, group in self.table.groupby(param_group): - metric_group = group[['name', metric_key] + varied_cols] + metric_group = group[["name", metric_key] + varied_cols] metric_vals = metric_group[metric_key] metric_vals = metric_vals.dropna() if len(metric_vals) > 0: @@ -473,18 +500,24 @@ class ResultAnalysis(ub.NiceRepr): value_to_metric[param_value] = metric_vals.values moments = pd.DataFrame(value_to_metric_stats).T - moments = moments.sort_values('mean', ascending=ascending) + moments = moments.sort_values("mean", ascending=ascending) moments.index.name = param_group_name moments.columns.name = metric_key - ranking = moments['mean'].index.to_list() + ranking = moments["mean"].index.to_list() param_to_rank = ub.invert_dict(dict(enumerate(ranking))) # Determine a set of value pairs to do pairwise comparisons on value_pairs = ub.oset() # value_pairs.update( # map(frozenset, ub.iter_window(moments.index, 2))) - value_pairs.update(map(frozenset, ub.iter_window( - moments.sort_values('mean', ascending=ascending).index, 2))) + value_pairs.update( + map( + frozenset, + ub.iter_window( + moments.sort_values("mean", ascending=ascending).index, 2 + ), + ) + ) # https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance # If the researcher can make the assumptions of an identically @@ -508,11 +541,11 @@ class ResultAnalysis(ub.NiceRepr): else: anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan) - stats_row['anova_rank_H'] = anova_krus_result.statistic - stats_row['anova_rank_p'] = anova_krus_result.pvalue - stats_row['anova_mean_F'] = anova_1way_result.statistic - stats_row['anova_mean_p'] = anova_1way_result.pvalue - stats_row['moments'] = moments + stats_row["anova_rank_H"] = anova_krus_result.statistic + stats_row["anova_rank_p"] = anova_krus_result.pvalue + stats_row["anova_mean_F"] = anova_1way_result.statistic + 
stats_row["anova_mean_p"] = anova_1way_result.pvalue + stats_row["moments"] = moments pair_stats_list = [] for pair in value_pairs: @@ -524,43 +557,50 @@ class ResultAnalysis(ub.NiceRepr): rank1 = param_to_rank[param_val1] rank2 = param_to_rank[param_val2] - pair_stats['winner'] = param_val1 if rank1 < rank2 else param_val2 - pair_stats['value1'] = param_val1 - pair_stats['value2'] = param_val2 - pair_stats['n1'] = len(metric_vals1) - pair_stats['n2'] = len(metric_vals2) + pair_stats["winner"] = param_val1 if rank1 < rank2 else param_val2 + pair_stats["value1"] = param_val1 + pair_stats["value2"] = param_val2 + pair_stats["n1"] = len(metric_vals1) + pair_stats["n2"] = len(metric_vals2) TEST_ONLY_FOR_DIFFERENCE = True if TEST_ONLY_FOR_DIFFERENCE: if ascending: # We want to minimize the metric - alternative = 'less' if rank1 < rank2 else 'greater' + alternative = "less" if rank1 < rank2 else "greater" else: # We want to maximize the metric - alternative = 'greater' if rank1 < rank2 else 'less' + alternative = "greater" if rank1 < rank2 else "less" else: - alternative = 'two-sided' + alternative = "two-sided" ind_kw = dict( equal_var=False, alternative=alternative, ) - ttest_ind_result = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw) + ttest_ind_result = scipy.stats.ttest_ind( + metric_vals1, metric_vals2, **ind_kw + ) if 0: from benchmarker.benchmarker import stats_dict + stats1 = stats_dict(metric_vals1) stats2 = stats_dict(metric_vals2) scipy.stats.ttest_ind_from_stats( - stats1['mean'], stats1['std'], stats1['nobs'], - stats2['mean'], stats2['std'], stats2['nobs'], - **ind_kw + stats1["mean"], + stats1["std"], + stats1["nobs"], + stats2["mean"], + stats2["std"], + stats2["nobs"], + **ind_kw, ) # metric_vals1, metric_vals2, equal_var=False) scipy.stats.ttest_ind_from_stats - pair_stats['ttest_ind'] = ttest_ind_result + pair_stats["ttest_ind"] = ttest_ind_result # Do relative checks, need to find comparable subgroups metric_group1 = value_to_metric_group[param_val1] @@ -588,18 +628,21 @@ class ResultAnalysis(ub.NiceRepr): # Does this need to have the values aligned? # I think that is the case giving my understanding of paired # t-tests, but the docs need a PR to make that more clear. 
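# Minimal illustrative sketch (not part of the patch): scipy.stats.ttest_rel
# pairs the i-th elements of its two inputs, so both arrays must refer to the
# same experimental condition at each index (the alignment concern raised in
# the comment above).
import numpy as np
from scipy.stats import ttest_rel
a = np.array([1.10, 1.32, 1.21, 1.40])  # e.g. metric for value1, per condition
b = np.array([1.02, 1.25, 1.13, 1.31])  # metric for value2, same conditions
stat, pvalue = ttest_rel(a, b)          # tests whether mean(a - b) is zero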
- ttest_rel_result = scipy.stats.ttest_rel(comparable_groups1, comparable_groups2) - pair_stats['n_common'] = len(common) - pair_stats['ttest_rel'] = ttest_rel_result + ttest_rel_result = scipy.stats.ttest_rel( + comparable_groups1, comparable_groups2 + ) + pair_stats["n_common"] = len(common) + pair_stats["ttest_rel"] = ttest_rel_result pair_stats_list.append(pair_stats) - stats_row['pairwise'] = pair_stats_list + stats_row["pairwise"] = pair_stats_list return stats_row def build(self): import itertools as it + if len(self.results) < 2: - raise Exception('need at least 2 results') + raise Exception("need at least 2 results") varied = self.varied.copy() if self.ignore_params: @@ -614,21 +657,26 @@ class ResultAnalysis(ub.NiceRepr): # settings, for each group setting do the k=1 analysis within that group varied_param_names = list(varied.keys()) num_varied_params = len(varied) - held_constant_orders = {num_varied_params + i if i < 0 else i for i in self.abalation_orders} + held_constant_orders = { + num_varied_params + i if i < 0 else i for i in self.abalation_orders + } held_constant_orders = [i for i in held_constant_orders if i > 0] held_constant_groups = [] for k in held_constant_orders: held_constant_groups.extend( - list(map(list, it.combinations(varied_param_names, k)))) + list(map(list, it.combinations(varied_param_names, k))) + ) if self.metrics is None: - avail_metrics = set.intersection(*[set(r.metrics.keys()) for r in self.results]) + avail_metrics = set.intersection( + *[set(r.metrics.keys()) for r in self.results] + ) metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics)) else: metrics_of_interest = self.metrics self.metrics_of_interest = metrics_of_interest - self._description['metrics_of_interest'] = metrics_of_interest - self._description['num_groups'] = len(held_constant_groups) + self._description["metrics_of_interest"] = metrics_of_interest + self._description["num_groups"] = len(held_constant_groups) # Analyze the impact of each parameter self.statistics = statistics = [] @@ -637,24 +685,29 @@ class ResultAnalysis(ub.NiceRepr): stats_row = self.test_group(param_group, metric_key) statistics.append(stats_row) - self.stats_table = pd.DataFrame([ - ub.dict_diff(d, {'pairwise', 'param_values', 'moments'}) - for d in self.statistics]) + self.stats_table = pd.DataFrame( + [ + ub.dict_diff(d, {"pairwise", "param_values", "moments"}) + for d in self.statistics + ] + ) if len(self.stats_table): - self.stats_table = self.stats_table.sort_values('anova_rank_p') + self.stats_table = self.stats_table.sort_values("anova_rank_p") - self._description['built'] = True + self._description["built"] = True def report(self): - stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name']) + stat_groups = ub.group_items(self.statistics, key=lambda x: x["param_name"]) stat_groups_items = list(stat_groups.items()) # Modify this order to change the grouping pattern - grid = ub.named_product({ - 'stat_group_item': stat_groups_items, - 'metrics': self.metrics_of_interest, - }) + grid = ub.named_product( + { + "stat_group_item": stat_groups_items, + "metrics": self.metrics_of_interest, + } + ) for grid_item in grid: self._report_one(grid_item) @@ -662,58 +715,76 @@ class ResultAnalysis(ub.NiceRepr): def _report_one(self, grid_item): p_threshold = self.p_threshold - metric_key = grid_item['metrics'] - stat_groups_item = grid_item['stat_group_item'] + metric_key = grid_item["metrics"] + stat_groups_item = grid_item["stat_group_item"] param_name, stat_group = 
stat_groups_item - stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0] - title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key)) - print('\n\n') + stats_row = ub.group_items(stat_group, key=lambda x: x["metric"])[metric_key][0] + title = f"PARAMETER: {param_name} - METRIC: {metric_key}" + print("\n\n") print(title) - print('=' * len(title)) - print(stats_row['moments']) - anova_rank_p = stats_row['anova_rank_p'] - anova_mean_p = stats_row['anova_mean_p'] + print("=" * len(title)) + print(stats_row["moments"]) + anova_rank_p = stats_row["anova_rank_p"] + anova_mean_p = stats_row["anova_mean_p"] # Rougly speaking - print('') - print(f'ANOVA: If p is low, the param {param_name!r} might have an effect') - print(ub.color_text(f' Rank-ANOVA: p={anova_rank_p:0.8f}', - 'green' if anova_rank_p < p_threshold else None)) - print(ub.color_text(f' Mean-ANOVA: p={anova_mean_p:0.8f}', - 'green' if anova_mean_p < p_threshold else None)) - print('') - print('Pairwise T-Tests') - for pairstat in stats_row['pairwise']: + print("") + print(f"ANOVA: If p is low, the param {param_name!r} might have an effect") + print( + ub.color_text( + f" Rank-ANOVA: p={anova_rank_p:0.8f}", + "green" if anova_rank_p < p_threshold else None, + ) + ) + print( + ub.color_text( + f" Mean-ANOVA: p={anova_mean_p:0.8f}", + "green" if anova_mean_p < p_threshold else None, + ) + ) + print("") + print("Pairwise T-Tests") + for pairstat in stats_row["pairwise"]: # Is this backwards? - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] + value1 = pairstat["value1"] + value2 = pairstat["value2"] + winner = pairstat["winner"] if value2 == winner: value1, value2 = value2, value1 - print(f' If p is low, {param_name}={value1} may outperform {param_name}={value2}.') - if 'ttest_ind' in pairstat: - ttest_ind_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_ind: p={ttest_ind_result.pvalue:0.8f}', - 'green' if ttest_ind_result.pvalue < p_threshold else None)) - if 'ttest_rel' in pairstat: - n_common = pairstat['n_common'] - ttest_rel_result = pairstat['ttest_ind'] - print(ub.color_text(f' ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', - 'green' if ttest_rel_result.pvalue < p_threshold else None)) + print( + f" If p is low, {param_name}={value1} may outperform {param_name}={value2}." 
+ ) + if "ttest_ind" in pairstat: + ttest_ind_result = pairstat["ttest_ind"] + print( + ub.color_text( + f" ttest_ind: p={ttest_ind_result.pvalue:0.8f}", + "green" if ttest_ind_result.pvalue < p_threshold else None, + ) + ) + if "ttest_rel" in pairstat: + n_common = pairstat["n_common"] + ttest_rel_result = pairstat["ttest_ind"] + print( + ub.color_text( + f" ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}", + "green" if ttest_rel_result.pvalue < p_threshold else None, + ) + ) def conclusions(self): conclusions = [] for stat in self.statistics: - param_name = stat['param_name'] - metric = stat['metric'] - for pairstat in stat['pairwise']: - value1 = pairstat['value1'] - value2 = pairstat['value2'] - winner = pairstat['winner'] + param_name = stat["param_name"] + metric = stat["metric"] + for pairstat in stat["pairwise"]: + value1 = pairstat["value1"] + value2 = pairstat["value2"] + winner = pairstat["winner"] if value2 == winner: value1, value2 = value2, value1 - pvalue = stat = pairstat['ttest_ind'].pvalue - txt = (f'p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}.') + pvalue = stat = pairstat["ttest_ind"].pvalue + txt = f"p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}." conclusions.append(txt) return conclusions @@ -750,22 +821,24 @@ class ResultAnalysis(ub.NiceRepr): >>> self.plot(xlabel, metric_key, group_labels) """ import seaborn as sns + sns.set() from matplotlib import pyplot as plt # NOQA + data = self.table data = data.sort_values(metric_key) for gname, labels in group_labels.items(): if len(labels): new_col = [] - for row in data[labels].to_dict('records'): + for row in data[labels].to_dict("records"): item = ub.repr2(row, compact=1, si=1) new_col.append(item) gkey = gname + "_key" data[gkey] = new_col plot_kws = { - 'x': xlabel, - 'y': metric_key, + "x": xlabel, + "y": metric_key, } for gname, labels in group_labels.items(): if labels: @@ -776,34 +849,34 @@ class ResultAnalysis(ub.NiceRepr): fig_params = plot_kws.pop("fig", []) facet_kws = { - 'sharex': True, - 'sharey': True, + "sharex": True, + "sharey": True, } # facet_kws['col'] = plot_kws.pop("col", None) # facet_kws['row'] = plot_kws.pop("row", None) # if not facet_kws['row']: # facet_kws['col_wrap'] = 5 - plot_kws['row'] = plot_kws.get("row", None) + plot_kws["row"] = plot_kws.get("row", None) # if not plot_kws['row']: # plot_kws['col_wrap'] = 5 if not fig_params: - groups = [('', data)] + groups = [("", data)] else: groups = data.groupby(fig_params) - if 'marker' not in plot_kws: - plot_kws['marker'] = "o" + if "marker" not in plot_kws: + plot_kws["marker"] = "o" # We will want to overwrite this with our own std estimate - plot_kws['ci'] = "sd" + plot_kws["ci"] = "sd" # err_style='band', # err_kws=None, # Use a consistent pallete across plots - unique_hues = data['hue_key'].unique() + unique_hues = data["hue_key"].unique() palette = ub.dzip(unique_hues, sns.color_palette(n_colors=len(unique_hues))) - plot_kws['palette'] = palette + plot_kws["palette"] = palette plots = [] base_fnum = 1 @@ -819,9 +892,10 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.relplot( data=group, # kind='line', - kind='scatter', + kind="scatter", facet_kws=facet_kws, - **plot_kws) + **plot_kws, + ) fig = facet.figure fig.suptitle(fig_key) @@ -829,10 +903,12 @@ class ResultAnalysis(ub.NiceRepr): # facet = sns.FacetGrid(group, **facet_kws) # facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, **plot_kws) # facet.add_legend() - plots.append({ - 
'fig': fig, - 'facet': facet, - }) + plots.append( + { + "fig": fig, + "facet": facet, + } + ) return plots @@ -866,6 +942,7 @@ class SkillTracker: def __init__(self, player_ids): import openskill + self.player_ids = player_ids self.ratings = {m: openskill.Rating() for m in player_ids} # self.observations = [] @@ -879,6 +956,7 @@ class SkillTracker: Dict[T, float]: mapping from player ids to win probabilites """ from openskill import predict_win + teams = [[p] for p in list(self.ratings.keys())] ratings = [[r] for r in self.ratings.values()] probs = predict_win(ratings) @@ -897,6 +975,7 @@ class SkillTracker: winners are at the front (0-th place) of the list. """ import openskill + # self.observations.append(ranking) ratings = self.ratings team_standings = [[r] for r in ub.take(ratings, ranking)] diff --git a/json_benchmarks/benchmarker/util_json.py b/json_benchmarks/benchmarker/util_json.py index dc3da85..7930b33 100644 --- a/json_benchmarks/benchmarker/util_json.py +++ b/json_benchmarks/benchmarker/util_json.py @@ -1,9 +1,10 @@ import copy +import json +import pathlib +from collections import OrderedDict + import numpy as np import ubelt as ub -import json -from collections import OrderedDict -import pathlib def ensure_json_serializable(dict_, normalize_containers=False, verbose=0): @@ -64,7 +65,7 @@ def ensure_json_serializable(dict_, normalize_containers=False, verbose=0): elif isinstance(value, pathlib.Path): new_value = str(value) walker[prefix] = new_value - elif hasattr(value, '__json__'): + elif hasattr(value, "__json__"): new_value = value.__json__() walker[prefix] = new_value elif normalize_containers: @@ -159,9 +160,9 @@ def find_json_unserializable(data, quickcheck=False): # Purposely make loc non-hashable so its not confused with # an address. All we can know in this case is that they key # is at this level, there is no concept of where. 
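# Illustrative note (not part of the patch): this branch reports dictionary
# *keys* that are not JSON serializable (for example a tuple used as a key).
# Because such a key has no addressable position of its own, the reported
# loc ends with a deliberately non-hashable ['.keys', key] marker instead of
# a normal path element.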
- yield {'loc': root + [['.keys', key]], 'data': key} + yield {"loc": root + [[".keys", key]], "data": key} elif not isinstance(value, serializable_types): - yield {'loc': prefix, 'data': value} + yield {"loc": prefix, "data": value} def indexable_allclose(dct1, dct2, return_info=False): @@ -189,19 +190,21 @@ def indexable_allclose(dct1, dct2, return_info=False): walker1 = ub.IndexableWalker(dct1) walker2 = ub.IndexableWalker(dct2) flat_items1 = [ - (path, value) for path, value in walker1 - if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + (path, value) + for path, value in walker1 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0 + ] flat_items2 = [ - (path, value) for path, value in walker2 - if not isinstance(value, walker1.indexable_cls) or len(value) == 0] + (path, value) + for path, value in walker2 + if not isinstance(value, walker1.indexable_cls) or len(value) == 0 + ] flat_items1 = sorted(flat_items1) flat_items2 = sorted(flat_items2) if len(flat_items1) != len(flat_items2): - info = { - 'faillist': ['length mismatch'] - } + info = {"faillist": ["length mismatch"]} final_flag = False else: passlist = [] @@ -212,9 +215,13 @@ def indexable_allclose(dct1, dct2, return_info=False): p2, v2 = t2 assert p1 == p2 - flag = (v1 == v2) + flag = v1 == v2 if not flag: - if isinstance(v1, float) and isinstance(v2, float) and np.isclose(v1, v2): + if ( + isinstance(v1, float) + and isinstance(v2, float) + and np.isclose(v1, v2) + ): flag = True if flag: passlist.append(p1) @@ -223,8 +230,8 @@ def indexable_allclose(dct1, dct2, return_info=False): final_flag = len(faillist) == 0 info = { - 'passlist': passlist, - 'faillist': faillist, + "passlist": passlist, + "faillist": faillist, } if return_info: diff --git a/json_benchmarks/benchmarker/visualize.py b/json_benchmarks/benchmarker/visualize.py index 41f4679..f3d683a 100644 --- a/json_benchmarks/benchmarker/visualize.py +++ b/json_benchmarks/benchmarker/visualize.py @@ -2,7 +2,12 @@ import pandas as pd import ubelt as ub -def benchmark_analysis(rows, xlabel, group_labels, basis, ): +def benchmark_analysis( + rows, + xlabel, + group_labels, + basis, +): # xlabel = "size" # Set these to empty lists if they are not used # group_labels = { @@ -18,6 +23,7 @@ def benchmark_analysis(rows, xlabel, group_labels, basis, ): # key = ub.repr2(params, compact=1, si=1) from process_tracker.result_analysis import SkillTracker + RECORD_ALL = 0 USE_OPENSKILL = True diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 1bf6e65..306bf67 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -2,10 +2,11 @@ Main definition of the benchmarks """ import json -import ubelt as ub + import scriptconfig as scfg -from json_benchmarks import benchmarker -from json_benchmarks import datagen +import ubelt as ub + +from json_benchmarks import benchmarker, datagen KNOWN_LIBRARIES = [ "ujson", @@ -20,39 +21,49 @@ class JSONBenchmarkConfig(scfg.Config): """ Benchmark JSON implementations """ - default = { - 'disable': scfg.Value([], choices=KNOWN_LIBRARIES, help=ub.paragraph( - ''' - Remove specified libraries from the benchmarks - ''' - )), - 'factor': scfg.Value(1.0, help=ub.paragraph( - ''' + default = { + "disable": scfg.Value( + [], + choices=KNOWN_LIBRARIES, + help=ub.paragraph( + """ + Remove specified libraries from the benchmarks + """ + ), + ), + "factor": scfg.Value( + 1.0, + help=ub.paragraph( + """ Specify as a fraction to speed up benchmarks for development / testing - ''')), - - 'cache_dir': 
scfg.Value(None, help=ub.paragraph( - ''' + """ + ), + ), + "cache_dir": scfg.Value( + None, + help=ub.paragraph( + """ Location for benchmark cache. Defaults to $XDG_CACHE/ujson/benchmark_results/ - ''')), + """ + ), + ), } def normalize(self): - dpath = self['cache_dir'] + dpath = self["cache_dir"] if dpath is None: - dpath = ub.Path.appdir('ujson/benchmark_results') + dpath = ub.Path.appdir("ujson/benchmark_results") dpath = ub.Path(dpath) - self['cache_dir'] = dpath + self["cache_dir"] = dpath def available_json_impls(): import importlib - known_modnames = [ - 'ujson', 'json', 'nujson', 'orjson', 'simplejson' - ] + + known_modnames = ["ujson", "json", "nujson", "orjson", "simplejson"] json_impls = {} for libname in known_modnames: try: @@ -61,8 +72,8 @@ def available_json_impls(): pass else: json_impls[libname] = { - 'module': module, - 'version': module.__version__, + "module": module, + "version": module.__version__, } return json_impls @@ -75,15 +86,15 @@ def benchmark_json(): # These are the parameters that we benchmark over common_basis = { "impl": list(json_impls.keys()), - "func": ['dumps', 'loads'], + "func": ["dumps", "loads"], } sized_basis = { "input": [ - 'Array with doubles', - 'Array with UTF-8 strings', + "Array with doubles", + "Array with UTF-8 strings", # 'Medium complex object', - 'Array with True values', - 'Array of Dict[str, int]', + "Array with True values", + "Array of Dict[str, int]", # 'Dict of List[Dict[str, int]]', # 'Complex object' ], @@ -91,10 +102,8 @@ def benchmark_json(): # 1024, 2048, 4096, 8192, 12288], } predefined_basis = { - "input": [ - 'Complex object' - ], - 'size': [None], + "input": ["Complex object"], + "size": [None], } basis = [ @@ -106,7 +115,7 @@ def benchmark_json(): # abstract away the details of timing a process over a grid of parameters, # serializing the results, and aggregating results from disparate runs. benchmark = benchmarker.Benchmarker( - name='bench_json', + name="bench_json", num=100, bestof=10, verbose=3, @@ -114,7 +123,7 @@ def benchmark_json(): ) def is_blocked(params): - if params['input'] == 'Complex object' and params['impl'] == 'orjson': + if params["input"] == "Complex object" and params["impl"] == "orjson": return True # For each variation of your experiment, create a row. @@ -124,12 +133,12 @@ def benchmark_json(): # Make any modifications you need to compute input kwargs for each # method here. 
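# Illustrative sketch (not part of the patch): in the dispatch just below,
# the 'loads' variant encodes its input once during setup, so only decoding
# runs inside the timed region. Conceptually:
import json
encoded = json.dumps([1.5, 2.5, 3.5])  # setup work, outside the timer
decoded = json.loads(encoded)          # the call whose time is measured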
impl_info = json_impls[params["impl"]] - params["impl_version"] = impl_info['version'] - module = impl_info['module'] - if params['func'] == 'dumps': + params["impl_version"] = impl_info["version"] + module = impl_info["module"] + if params["func"] == "dumps": method = module.dumps data = data_lut[params["input"]](params["size"]) - elif params['func'] == 'loads': + elif params["func"] == "loads": method = module.loads to_encode = data_lut[params["input"]](params["size"]) data = json.dumps(to_encode) @@ -142,17 +151,18 @@ def benchmark_json(): # Put the logic you want to time here method(data) - dpath = ub.Path.appdir('ujson/benchmark_results').ensuredir() + dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() result_fpath = benchmark.dump_in_dpath(dpath) return result_fpath def aggregate_results(result_fpaths): import json + results = [] for fpath in result_fpaths: data = json.loads(fpath.read_text()) - for row in data['rows']: + for row in data["rows"]: result = benchmarker.BenchmarkerResult.load(fpath) results.extend(result.to_result_list()) @@ -164,12 +174,13 @@ def aggregate_results(result_fpaths): analysis = benchmarker.result_analysis.ResultAnalysis( results, metrics=[metric_key], - params=['impl'], + params=["impl"], metric_objectives={ - 'min_time': 'min', - 'mean_time': 'min', - 'time': 'min', - }) + "min_time": "min", + "mean_time": "min", + "time": "min", + }, + ) analysis.analysis() table = analysis.table @@ -182,21 +193,23 @@ def aggregate_results(result_fpaths): otherwise the new column for that row is set to None. """ import pandas as pd + # Stats groupings stats_cols = [ - 'nobs_time', - 'std_time', - 'mean_time', - 'max_time', - 'min_time', + "nobs_time", + "std_time", + "mean_time", + "max_time", + "min_time", ] - mapper = {c: c.replace('_time', '') for c in stats_cols} + mapper = {c: c.replace("_time", "") for c in stats_cols} unmapper = ub.invert_dict(mapper) non_stats_cols = list(ub.oset(data.columns) - stats_cols) if group_keys is None: group_keys = non_stats_cols non_group_keys = list(ub.oset(non_stats_cols) - group_keys) from json_benchmarks.benchmarker.benchmarker import combine_stats_arrs + new_rows = [] for group_vals, group in list(data.groupby(group_keys)): # hack, is this a pandas bug in 1.4.1? 
Is it fixed @@ -218,16 +231,18 @@ def aggregate_results(result_fpaths): new_data = pd.DataFrame(new_rows) return new_data - single_size = table[(table['size'] == 256) | table['size'].isnull()] + single_size = table[(table["size"] == 256) | table["size"].isnull()] # single_size_combo = aggregate_time_stats(single_size, None) - single_size_combo = aggregate_time_stats(single_size, ['name']) + single_size_combo = aggregate_time_stats(single_size, ["name"]) - param_group = ['impl', 'impl_version'] - single_size_combo['calls/sec'] = 1 / single_size_combo['mean_time'] + param_group = ["impl", "impl_version"] + single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] _single_size_combo = single_size_combo.copy() - _single_size_combo['calls/sec'] = _single_size_combo['calls/sec'].apply(lambda x: '{:,.02f}'.format(x)) - piv = _single_size_combo.pivot(['input', 'func'], param_group, 'calls/sec') - print('Table for size=256') + _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( + lambda x: f"{x:,.02f}" + ) + piv = _single_size_combo.pivot(["input", "func"], param_group, "calls/sec") + print("Table for size=256") print(piv) analysis.abalate(param_group) @@ -242,29 +257,31 @@ def aggregate_results(result_fpaths): "size": [], } import kwplot + kwplot.autosns() plots = analysis.plot(xlabel, metric_key, group_labels) for plot in plots: - for ax in plot['facet'].axes.ravel(): - ax.set_xscale('log') - ax.set_yscale('log') + for ax in plot["facet"].axes.ravel(): + ax.set_xscale("log") + ax.set_yscale("log") kwplot.show_if_requested() def main(): from json_benchmarks import core + config = core.JSONBenchmarkConfig(cmdline=True) - dpath = config['cache_dir'] + dpath = config["cache_dir"] run = 1 if run: result_fpath = core.benchmark_json() - print('result_fpath = {!r}'.format(result_fpath)) + print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] agg = 1 if agg: - result_fpaths = list(dpath.glob('benchmarks*.json')) + result_fpaths = list(dpath.glob("benchmarks*.json")) core.aggregate_results(result_fpaths) # results_output_table(libraries) diff --git a/json_benchmarks/datagen.py b/json_benchmarks/datagen.py index afb2708..f7f1f71 100644 --- a/json_benchmarks/datagen.py +++ b/json_benchmarks/datagen.py @@ -1,5 +1,6 @@ import random import sys + import ubelt as ub @@ -22,21 +23,23 @@ def json_test_data_generators(): >>> print('test_object = {!r}'.format(test_object)) """ data_lut = {} + def _register_data(name): def _wrap(func): data_lut[name] = func + return _wrap # seed if desired # rng = random.Random(0) rng = random - @_register_data('Array with doubles') + @_register_data("Array with doubles") def array_with_doubles(size): test_object = [sys.maxsize * rng.random() for _ in range(size)] return test_object - @_register_data('Array with UTF-8 strings') + @_register_data("Array with UTF-8 strings") def array_with_utf8_strings(size): utf8_string = ( "نظام الحكم سلطاني وراثي " @@ -46,7 +49,7 @@ def json_test_data_generators(): test_object = [utf8_string for _ in range(size)] return test_object - @_register_data('Medium complex object') + @_register_data("Medium complex object") def medium_complex_object(size): user = { "userId": 3381293, @@ -63,20 +66,19 @@ def json_test_data_generators(): test_object = [[user, friends] for _ in range(size)] return test_object - @_register_data('Array with True values') + @_register_data("Array with True values") def true_values(size): test_object = [True for _ in range(size)] return test_object - @_register_data('Array of 
Dict[str, int]') + @_register_data("Array of Dict[str, int]") def array_of_dict_string_int(size): test_object = [ - {str(rng.random() * 20): int(rng.random() * 1000000)} - for _ in range(size) + {str(rng.random() * 20): int(rng.random() * 1000000)} for _ in range(size) ] return test_object - @_register_data('Dict of List[Dict[str, int]]') + @_register_data("Dict of List[Dict[str, int]]") def dict_of_list_dict_str_int(size): keys = set() while len(keys) < size: @@ -91,23 +93,25 @@ def json_test_data_generators(): } return test_object - @_register_data('Complex object') + @_register_data("Complex object") def complex_object(size): import json + # TODO: might be better to reigster this file with setup.py or # download it via some mechanism try: dpath = ub.Path(__file__).parent - fpath = dpath / 'sample.json' + fpath = dpath / "sample.json" if not fpath.exists(): raise Exception except Exception: import ujson - dpath = ub.Path(ujson.__file__).parent / 'tests' - fpath = dpath / 'sample.json' + + dpath = ub.Path(ujson.__file__).parent / "tests" + fpath = dpath / "sample.json" if not fpath.exists(): raise Exception - with open(fpath, 'r') as f: + with open(fpath) as f: test_object = json.load(f) if size is not None: test_object = [test_object] * size From 78cbf7ea71766bafc6dd49cb3c13eba278317fbf Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 28 May 2022 21:56:26 -0400 Subject: [PATCH 09/25] tweaks --- json_benchmarks/benchmarker/__init__.py | 84 +++---- json_benchmarks/benchmarker/benchmarker.py | 83 +------ .../benchmarker/result_analysis.py | 33 ++- json_benchmarks/benchmarker/util_stats.py | 235 ++++++++++++++++++ json_benchmarks/core.py | 157 ++++++------ 5 files changed, 367 insertions(+), 225 deletions(-) create mode 100644 json_benchmarks/benchmarker/util_stats.py diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index ee32ea0..5614b61 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,57 +9,35 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import ( - aggregate, - benchmarker, - process_context, - result_analysis, - util_json, - visualize, -) -from json_benchmarks.benchmarker.aggregate import demo, demo_data -from json_benchmarks.benchmarker.benchmarker import ( - Benchmarker, - BenchmarkerConfig, - BenchmarkerResult, - combine_stats, - stats_dict, -) -from json_benchmarks.benchmarker.process_context import ProcessContext -from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, - Result, - ResultAnalysis, - SkillTracker, -) -from json_benchmarks.benchmarker.util_json import ( - ensure_json_serializable, - find_json_unserializable, - indexable_allclose, -) -from json_benchmarks.benchmarker.visualize import benchmark_analysis +from json_benchmarks.benchmarker import aggregate +from json_benchmarks.benchmarker import benchmarker +from json_benchmarks.benchmarker import process_context +from json_benchmarks.benchmarker import result_analysis +from json_benchmarks.benchmarker import util_json +from json_benchmarks.benchmarker import util_stats +from json_benchmarks.benchmarker import visualize -__all__ = [ - "Benchmarker", - "BenchmarkerConfig", - "BenchmarkerResult", - "DEFAULT_METRIC_TO_OBJECTIVE", - "ProcessContext", - "Result", - "ResultAnalysis", - "SkillTracker", - "aggregate", - "benchmark_analysis", - "benchmarker", - "combine_stats", - "demo", - "demo_data", - 
"ensure_json_serializable", - "find_json_unserializable", - "indexable_allclose", - "process_context", - "result_analysis", - "stats_dict", - "util_json", - "visualize", -] +from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) +from json_benchmarks.benchmarker.benchmarker import (Benchmarker, + BenchmarkerConfig, + BenchmarkerResult,) +from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker.result_analysis import ( + DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) +from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from json_benchmarks.benchmarker.util_stats import (aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict,) +from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', + 'ResultAnalysis', 'SkillTracker', 'aggregate', 'aggregate_stats', + 'benchmark_analysis', 'benchmarker', 'combine_stats', + 'combine_stats_arrs', 'demo', 'demo_data', + 'ensure_json_serializable', 'find_json_unserializable', + 'indexable_allclose', 'process_context', 'result_analysis', + 'stats_dict', 'util_json', 'util_stats', 'visualize'] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 008ba82..24859ed 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -158,91 +158,12 @@ class Benchmarker: } rows.append(row) else: + from json_benchmarks.benchmarker import util_stats times = np.array(ti.robust_times()) - metrics = stats_dict(times, "_time") + metrics = util_stats.stats_dict(times, "_time") row = { "metrics": metrics, "params": params, "name": key, } rows.append(row) - - -def stats_dict(data, suffix=""): - stats = { - "nobs" + suffix: len(data), - "mean" + suffix: data.mean(), - "std" + suffix: data.std(), - "min" + suffix: data.min(), - "max" + suffix: data.max(), - } - return stats - - -def combine_stats(s1, s2): - """ - Helper for combining mean and standard deviation of multiple measurements - - Args: - s1 (dict): stats dict containing mean, std, and n - s2 (dict): stats dict containing mean, std, and n - - Example: - >>> basis = { - >>> 'nobs1': [1, 10, 100, 10000], - >>> 'nobs2': [1, 10, 100, 10000], - >>> } - >>> for params in ub.named_product(basis): - >>> data1 = np.random.rand(params['nobs1']) - >>> data2 = np.random.rand(params['nobs2']) - >>> data3 = np.hstack([data1, data2]) - >>> s1 = stats_dict(data1) - >>> s2 = stats_dict(data2) - >>> s3 = stats_dict(data3) - >>> # Check that our combo works - >>> combo_s3 = combine_stats(s1, s2) - >>> compare = pd.DataFrame({'raw': s3, 'combo': combo_s3}) - >>> print(compare) - >>> assert np.allclose(compare.raw, compare.combo) - - References: - https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations - https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups - """ - stats = [s1, s2] - data = { - "nobs": np.array([s["nobs"] for s in stats]), - "mean": np.array([s["mean"] for s in stats]), - "std": np.array([s["std"] for s in stats]), - "min": np.array([s["min"] for s in stats]), - "max": np.array([s["max"] for s in stats]), - } - combine_stats_arrs(data) - - -def combine_stats_arrs(data): - sizes = data["nobs"] - means = data["mean"] - stds = data["std"] - mins = 
data["min"] - maxs = data["max"] - varis = stds * stds - - combo_size = sizes.sum() - combo_mean = (sizes * means).sum() / combo_size - - mean_deltas = means - combo_mean - - sv = (sizes * varis).sum() - sm = (sizes * (mean_deltas * mean_deltas)).sum() - combo_vars = (sv + sm) / combo_size - combo_std = np.sqrt(combo_vars) - - combo_stats = { - "nobs": combo_size, - "mean": combo_mean, - "std": combo_std, - "min": mins.min(), - "max": maxs.max(), - } - return combo_stats diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 944d85e..81865e2 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -788,7 +788,7 @@ class ResultAnalysis(ub.NiceRepr): conclusions.append(txt) return conclusions - def plot(self, xlabel, metric_key, group_labels): + def plot(self, xlabel, metric_key, group_labels, **kwargs): """ Args: group_labels (dict): @@ -818,10 +818,10 @@ class ResultAnalysis(ub.NiceRepr): >>> 'hue': ['z'], >>> 'size': [], >>> } - >>> self.plot(xlabel, metric_key, group_labels) + >>> kwargs = {'xscale': 'log', 'yscale': 'log'} + >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ import seaborn as sns - sns.set() from matplotlib import pyplot as plt # NOQA @@ -903,12 +903,27 @@ class ResultAnalysis(ub.NiceRepr): # facet = sns.FacetGrid(group, **facet_kws) # facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, **plot_kws) # facet.add_legend() - plots.append( - { - "fig": fig, - "facet": facet, - } - ) + + plot = { + "fig": fig, + "facet": facet, + } + plots.append(plot) + + for plot in plots: + xscale = kwargs.get('xscale', None) + yscale = kwargs.get('yscale', None) + for ax in plot['facet'].axes.ravel(): + if xscale is not None: + try: + ax.set_xscale(xscale) + except ValueError: + pass + if yscale is not None: + try: + ax.set_yscale(yscale) + except ValueError: + pass return plots diff --git a/json_benchmarks/benchmarker/util_stats.py b/json_benchmarks/benchmarker/util_stats.py new file mode 100644 index 0000000..3d12965 --- /dev/null +++ b/json_benchmarks/benchmarker/util_stats.py @@ -0,0 +1,235 @@ +import ubelt as ub +import numpy as np + +def __tabulate_issue(): + # MWE for tabulate issue + # The decimals are not aligned when using "," in the floatfmt + import tabulate + data = [ + [13213.2, 3213254.23, 432432.231,], + [432432., 432.3, 3.2] + ] + print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt=',.02f')) + print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt='.02f')) + + +def __groupby_issue(): + # MWE of an issue with pandas groupby + import pandas as pd + data = pd.DataFrame([ + {'p1': 'a', 'p2': 1, 'p3': 0}, + {'p1': 'a', 'p2': 1, 'p3': 0}, + {'p1': 'a', 'p2': 2, 'p3': 0}, + {'p1': 'b', 'p2': 2, 'p3': 0}, + {'p1': 'b', 'p2': 1, 'p3': 0}, + {'p1': 'b', 'p2': 1, 'p3': 0}, + {'p1': 'b', 'p2': 1, 'p3': 0}, + ]) + + by = 'p1' + key = list(data.groupby(by))[0][0] + result = { + 'by': by, + 'key': key, + 'type(key)': type(key) + } + print('result = {}'.format(ub.repr2(result, nl=1))) + assert not ub.iterable(key), ( + '`by` is specified as a scalar, so getting `key` as a scalar makes sense') + + by = ['p1'] + key = list(data.groupby(by))[0][0] + result = { + 'by': by, + 'key': key, + 'type(key)': type(key) + } + print('result = {}'.format(ub.repr2(result, nl=1))) + assert not ub.iterable(key), ( + '`by` is specified as a list of scalars (with one element), but we ' + 'still get `key` as a scalar. 
This does not make sense') + + by = ['p1', 'p2'] + key = list(data.groupby(by))[0][0] + result = { + 'by': by, + 'key': key, + 'type(key)': type(key) + } + print('result = {}'.format(ub.repr2(result, nl=1))) + assert ub.iterable(key), ( + '`by` is specified as a list of scalars (with multiple elements), ' + 'and we still get `key` as a tuple of values. This makes sense') + + +def aggregate_stats(data, suffix='', group_keys=None): + """ + Given columns interpreted as containing stats, aggregate those stats + within each group. For each row, any non-group, non-stat column + with consistent values across that columns in the group is kept as-is, + otherwise the new column for that row is set to None. + + Args: + data (DataFrame): + a data frame with columns: 'mean', 'std', 'min', 'max', and 'nobs' + (possibly with a suffix) + + suffix (str): + if the nobs, std, mean, min, and max have a suffix, specify it + + group_keys (List[str]): + pass + + Returns: + DataFrame: + New dataframe where grouped rows have been aggregated into a single + row. + + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson')) + >>> from json_benchmarks.benchmarker.util_stats import * # NOQA + >>> import pandas as pd + >>> data = pd.DataFrame([ + >>> # + >>> {'mean': 8, 'std': 1, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'a', 'p2': 1}, + >>> {'mean': 6, 'std': 2, 'min': 0, 'max': 1, 'nobs': 3, 'p1': 'a', 'p2': 1}, + >>> {'mean': 7, 'std': 3, 'min': 0, 'max': 2, 'nobs': 5, 'p1': 'a', 'p2': 2}, + >>> {'mean': 5, 'std': 4, 'min': 0, 'max': 3, 'nobs': 7, 'p1': 'a', 'p2': 1}, + >>> # + >>> {'mean': 3, 'std': 1, 'min': 0, 'max': 20, 'nobs': 6, 'p1': 'b', 'p2': 1}, + >>> {'mean': 0, 'std': 2, 'min': 0, 'max': 20, 'nobs': 26, 'p1': 'b', 'p2': 2}, + >>> {'mean': 9, 'std': 3, 'min': 0, 'max': 20, 'nobs': 496, 'p1': 'b', 'p2': 1}, + >>> # + >>> {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'c', 'p2': 2}, + >>> {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 7, 'p1': 'c', 'p2': 2}, + >>> # + >>> {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'd', 'p2': 2}, + >>> # + >>> {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'e', 'p2': 1}, + >>> ]) + >>> print(data) + >>> new_data = aggregate_stats(data) + >>> print(new_data) + >>> new_data1 = aggregate_stats(data, group_keys=['p1']) + >>> print(new_data1) + >>> new_data2 = aggregate_stats(data, group_keys=['p2']) + >>> print(new_data2) + """ + import pandas as pd + + # Stats groupings + raw_stats_cols = ["nobs", "std", "mean", "max", "min"] + stats_cols = [c + suffix for c in raw_stats_cols] + mapper = dict(zip(stats_cols, raw_stats_cols)) + unmapper = dict(zip(raw_stats_cols, stats_cols)) + non_stats_cols = list(ub.oset(data.columns) - stats_cols) + if group_keys is None: + group_keys = non_stats_cols + non_group_keys = list(ub.oset(non_stats_cols) - group_keys) + + new_rows = [] + for group_vals, group in list(data.groupby(group_keys)): + # hack, is this a pandas bug in 1.4.1? Is it fixed? 
(Not in 1.4.2) + if isinstance(group_keys, list) and len(group_keys) == 1: + # For some reason, when we specify group keys as a list of one + # element, we get a squeezed value out + group_vals = (group_vals,) + stat_data = group[stats_cols].rename(mapper, axis=1) + new_stats = combine_stats_arrs(stat_data) + new_time_stats = ub.map_keys(unmapper, new_stats) + new_row = ub.dzip(group_keys, group_vals) + if non_group_keys: + for k in non_group_keys: + unique_vals = group[k].unique() + if len(unique_vals) == 1: + new_row[k] = unique_vals[0] + else: + new_row[k] = None + new_row.update(new_time_stats) + new_rows.append(new_row) + new_data = pd.DataFrame(new_rows) + return new_data + + +def stats_dict(data, suffix=""): + stats = { + "nobs" + suffix: len(data), + "mean" + suffix: data.mean(), + "std" + suffix: data.std(), + "min" + suffix: data.min(), + "max" + suffix: data.max(), + } + return stats + + +def combine_stats(s1, s2): + """ + Helper for combining mean and standard deviation of multiple measurements + + Args: + s1 (dict): stats dict containing mean, std, and n + s2 (dict): stats dict containing mean, std, and n + + Example: + >>> basis = { + >>> 'nobs1': [1, 10, 100, 10000], + >>> 'nobs2': [1, 10, 100, 10000], + >>> } + >>> for params in ub.named_product(basis): + >>> data1 = np.random.rand(params['nobs1']) + >>> data2 = np.random.rand(params['nobs2']) + >>> data3 = np.hstack([data1, data2]) + >>> s1 = stats_dict(data1) + >>> s2 = stats_dict(data2) + >>> s3 = stats_dict(data3) + >>> # Check that our combo works + >>> combo_s3 = combine_stats(s1, s2) + >>> compare = pd.DataFrame({'raw': s3, 'combo': combo_s3}) + >>> print(compare) + >>> assert np.allclose(compare.raw, compare.combo) + + References: + https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations + https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups + """ + stats = [s1, s2] + data = { + "nobs": np.array([s["nobs"] for s in stats]), + "mean": np.array([s["mean"] for s in stats]), + "std": np.array([s["std"] for s in stats]), + "min": np.array([s["min"] for s in stats]), + "max": np.array([s["max"] for s in stats]), + } + combine_stats_arrs(data) + + +def combine_stats_arrs(data): + sizes = data["nobs"] + means = data["mean"] + stds = data["std"] + mins = data["min"] + maxs = data["max"] + varis = stds * stds + + # TODO: ddof + # https://github.com/Erotemic/misc/blob/28cf797b9b0f8bd82e3ebee2f6d0a688ecee2838/learn/stats.py#L128 + + combo_size = sizes.sum() + combo_mean = (sizes * means).sum() / combo_size + + mean_deltas = means - combo_mean + + sv = (sizes * varis).sum() + sm = (sizes * (mean_deltas * mean_deltas)).sum() + combo_vars = (sv + sm) / combo_size + combo_std = np.sqrt(combo_vars) + + combo_stats = { + "nobs": combo_size, + "mean": combo_mean, + "std": combo_std, + "min": mins.min(), + "max": maxs.max(), + } + return combo_stats diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 306bf67..18617c7 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -7,6 +7,7 @@ import scriptconfig as scfg import ubelt as ub from json_benchmarks import benchmarker, datagen +from json_benchmarks.benchmarker import util_stats KNOWN_LIBRARIES = [ "ujson", @@ -23,31 +24,50 @@ class JSONBenchmarkConfig(scfg.Config): """ default = { + "mode": scfg.Value( + "all", + position=1, + choices=["all", "single", "run", "analyze"], + help=ub.paragraph( + """ + By default all benchmarks are run, saved, and aggregated + with any other existing 
benchmarks for analysis and + visualization. + + In "single" mode, other existing benchmarks are ignord. + + In "run" mode, the benchmarks are run, but no analysis is done. + + In "analyze" mode, no benchmarks are run, but any existing + benchmarks are loaded for analysis and visualization. + """) + ), + "disable": scfg.Value( [], choices=KNOWN_LIBRARIES, help=ub.paragraph( """ - Remove specified libraries from the benchmarks - """ + Remove specified libraries from the benchmarks + """ ), ), "factor": scfg.Value( 1.0, help=ub.paragraph( """ - Specify as a fraction to speed up benchmarks for development / - testing - """ + Specify as a fraction to speed up benchmarks for development / + testing + """ ), ), "cache_dir": scfg.Value( None, help=ub.paragraph( """ - Location for benchmark cache. - Defaults to $XDG_CACHE/ujson/benchmark_results/ - """ + Location for benchmark cache. + Defaults to $XDG_CACHE/ujson/benchmark_results/ + """ ), ), } @@ -62,8 +82,7 @@ class JSONBenchmarkConfig(scfg.Config): def available_json_impls(): import importlib - - known_modnames = ["ujson", "json", "nujson", "orjson", "simplejson"] + known_modnames = KNOWN_LIBRARIES json_impls = {} for libname in known_modnames: try: @@ -116,8 +135,8 @@ def benchmark_json(): # serializing the results, and aggregating results from disparate runs. benchmark = benchmarker.Benchmarker( name="bench_json", - num=100, - bestof=10, + num=1000, + bestof=100, verbose=3, basis=basis, ) @@ -156,7 +175,7 @@ def benchmark_json(): return result_fpath -def aggregate_results(result_fpaths): +def analyze_results(result_fpaths): import json results = [] @@ -185,65 +204,28 @@ def aggregate_results(result_fpaths): table = analysis.table - def aggregate_time_stats(data, group_keys=None): - """ - Given columns interpreted as containing stats, aggregate those stats - within each group. For each row, any non-group, non-stat column - with consistent values across that columns in the group is kept as-is, - otherwise the new column for that row is set to None. - """ - import pandas as pd - - # Stats groupings - stats_cols = [ - "nobs_time", - "std_time", - "mean_time", - "max_time", - "min_time", - ] - mapper = {c: c.replace("_time", "") for c in stats_cols} - unmapper = ub.invert_dict(mapper) - non_stats_cols = list(ub.oset(data.columns) - stats_cols) - if group_keys is None: - group_keys = non_stats_cols - non_group_keys = list(ub.oset(non_stats_cols) - group_keys) - from json_benchmarks.benchmarker.benchmarker import combine_stats_arrs - - new_rows = [] - for group_vals, group in list(data.groupby(group_keys)): - # hack, is this a pandas bug in 1.4.1? 
Is it fixed - if isinstance(group_keys, list) and not isinstance(group_vals, list): - group_vals = [group_vals] - stat_data = group[stats_cols].rename(mapper, axis=1) - new_stats = combine_stats_arrs(stat_data) - new_time_stats = ub.map_keys(unmapper, new_stats) - new_row = ub.dzip(group_keys, group_vals) - if non_group_keys: - for k in non_group_keys: - unique_vals = group[k].unique() - if len(unique_vals) == 1: - new_row[k] = unique_vals[0] - else: - new_row[k] = None - new_row.update(new_time_stats) - new_rows.append(new_row) - new_data = pd.DataFrame(new_rows) - return new_data - single_size = table[(table["size"] == 256) | table["size"].isnull()] - # single_size_combo = aggregate_time_stats(single_size, None) - single_size_combo = aggregate_time_stats(single_size, ["name"]) + # single_size_combo = aggregate_stats(single_size, None) + single_size_combo = util_stats.aggregate_stats(single_size, suffix='_time', group_keys=["name"]) param_group = ["impl", "impl_version"] single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] - _single_size_combo = single_size_combo.copy() - _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( - lambda x: f"{x:,.02f}" - ) - piv = _single_size_combo.pivot(["input", "func"], param_group, "calls/sec") + # _single_size_combo = single_size_combo.copy() + # _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( + # + # ) + time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") + + hz_piv = (1 / time_piv) + # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") print("Table for size=256") - print(piv) + # print(hzstr_piv.to_markdown()) + print(hz_piv.to_markdown(floatfmt=',.02f')) + print("") + print("Above metrics are in call/sec, larger is better.") + + speedup_piv = hz_piv / hz_piv['json'].values + print(speedup_piv.to_markdown(floatfmt=',.02g')) analysis.abalate(param_group) # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) @@ -253,35 +235,46 @@ def aggregate_results(result_fpaths): group_labels = { "fig": ["input"], "col": ["func"], + # "fig": [], + # "col": ["func" "input"], "hue": ["impl", "impl_version"], "size": [], } import kwplot - kwplot.autosns() - plots = analysis.plot(xlabel, metric_key, group_labels) - for plot in plots: - for ax in plot["facet"].axes.ravel(): - ax.set_xscale("log") - ax.set_yscale("log") + plots = analysis.plot( + xlabel, metric_key, group_labels, + xscale='log', yscale='log', + ) + plots kwplot.show_if_requested() -def main(): - from json_benchmarks import core - - config = core.JSONBenchmarkConfig(cmdline=True) +def main(cmdline=True, **kwargs): + """ + Example: + >>> import sys, ubelt + >>> sys.path.append(ubelt.expandpath('~/code/ultrajson')) + >>> from json_benchmarks.core import * # NOQA + >>> import kwplot + >>> kwplot.autosns() + >>> cmdline = False + >>> kwargs = {} + >>> main(cmdline, **kwargs) + """ + config = JSONBenchmarkConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] - run = 1 + run = config['mode'] in {'all', 'single', 'run'} if run: - result_fpath = core.benchmark_json() + result_fpath = benchmark_json() print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - agg = 1 + agg = config['mode'] not in {'single'} if agg: result_fpaths = list(dpath.glob("benchmarks*.json")) - core.aggregate_results(result_fpaths) - # results_output_table(libraries) + analyze = config['mode'] in {'all', 'single', 'analyze'} + if analyze: + analyze_results(result_fpaths) From 
470f440f3f0fd269e339978caa56a21e382f8379 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 01:56:39 +0000 Subject: [PATCH 10/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/benchmarker/__init__.py | 90 ++++++++++++------- json_benchmarks/benchmarker/benchmarker.py | 1 + .../benchmarker/result_analysis.py | 7 +- json_benchmarks/benchmarker/util_stats.py | 88 +++++++++--------- json_benchmarks/core.py | 31 ++++--- 5 files changed, 128 insertions(+), 89 deletions(-) diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index 5614b61..a278879 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,35 +9,65 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import aggregate -from json_benchmarks.benchmarker import benchmarker -from json_benchmarks.benchmarker import process_context -from json_benchmarks.benchmarker import result_analysis -from json_benchmarks.benchmarker import util_json -from json_benchmarks.benchmarker import util_stats -from json_benchmarks.benchmarker import visualize - -from json_benchmarks.benchmarker.aggregate import (demo, demo_data,) -from json_benchmarks.benchmarker.benchmarker import (Benchmarker, - BenchmarkerConfig, - BenchmarkerResult,) -from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker import ( + aggregate, + benchmarker, + process_context, + result_analysis, + util_json, + util_stats, + visualize, +) +from json_benchmarks.benchmarker.aggregate import demo, demo_data +from json_benchmarks.benchmarker.benchmarker import ( + Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, +) +from json_benchmarks.benchmarker.process_context import ProcessContext from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) -from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from json_benchmarks.benchmarker.util_stats import (aggregate_stats, - combine_stats, - combine_stats_arrs, - stats_dict,) -from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + DEFAULT_METRIC_TO_OBJECTIVE, + Result, + ResultAnalysis, + SkillTracker, +) +from json_benchmarks.benchmarker.util_json import ( + ensure_json_serializable, + find_json_unserializable, + indexable_allclose, +) +from json_benchmarks.benchmarker.util_stats import ( + aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict, +) +from json_benchmarks.benchmarker.visualize import benchmark_analysis -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', - 'ResultAnalysis', 'SkillTracker', 'aggregate', 'aggregate_stats', - 'benchmark_analysis', 'benchmarker', 'combine_stats', - 'combine_stats_arrs', 'demo', 'demo_data', - 'ensure_json_serializable', 'find_json_unserializable', - 'indexable_allclose', 'process_context', 'result_analysis', - 'stats_dict', 'util_json', 'util_stats', 'visualize'] +__all__ = [ + "Benchmarker", + "BenchmarkerConfig", + "BenchmarkerResult", + "DEFAULT_METRIC_TO_OBJECTIVE", + "ProcessContext", + "Result", + "ResultAnalysis", + "SkillTracker", + "aggregate", + "aggregate_stats", + 
"benchmark_analysis", + "benchmarker", + "combine_stats", + "combine_stats_arrs", + "demo", + "demo_data", + "ensure_json_serializable", + "find_json_unserializable", + "indexable_allclose", + "process_context", + "result_analysis", + "stats_dict", + "util_json", + "util_stats", + "visualize", +] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 24859ed..7a0d4fa 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -159,6 +159,7 @@ class Benchmarker: rows.append(row) else: from json_benchmarks.benchmarker import util_stats + times = np.array(ti.robust_times()) metrics = util_stats.stats_dict(times, "_time") row = { diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 81865e2..846a046 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -822,6 +822,7 @@ class ResultAnalysis(ub.NiceRepr): >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ import seaborn as sns + sns.set() from matplotlib import pyplot as plt # NOQA @@ -911,9 +912,9 @@ class ResultAnalysis(ub.NiceRepr): plots.append(plot) for plot in plots: - xscale = kwargs.get('xscale', None) - yscale = kwargs.get('yscale', None) - for ax in plot['facet'].axes.ravel(): + xscale = kwargs.get("xscale", None) + yscale = kwargs.get("yscale", None) + for ax in plot["facet"].axes.ravel(): if xscale is not None: try: ax.set_xscale(xscale) diff --git a/json_benchmarks/benchmarker/util_stats.py b/json_benchmarks/benchmarker/util_stats.py index 3d12965..2eaa32c 100644 --- a/json_benchmarks/benchmarker/util_stats.py +++ b/json_benchmarks/benchmarker/util_stats.py @@ -1,68 +1,68 @@ -import ubelt as ub import numpy as np +import ubelt as ub + def __tabulate_issue(): # MWE for tabulate issue # The decimals are not aligned when using "," in the floatfmt import tabulate + data = [ - [13213.2, 3213254.23, 432432.231,], - [432432., 432.3, 3.2] + [ + 13213.2, + 3213254.23, + 432432.231, + ], + [432432.0, 432.3, 3.2], ] - print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt=',.02f')) - print(tabulate.tabulate(data, headers=['a', 'b'], floatfmt='.02f')) + print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=",.02f")) + print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=".02f")) def __groupby_issue(): # MWE of an issue with pandas groupby import pandas as pd - data = pd.DataFrame([ - {'p1': 'a', 'p2': 1, 'p3': 0}, - {'p1': 'a', 'p2': 1, 'p3': 0}, - {'p1': 'a', 'p2': 2, 'p3': 0}, - {'p1': 'b', 'p2': 2, 'p3': 0}, - {'p1': 'b', 'p2': 1, 'p3': 0}, - {'p1': 'b', 'p2': 1, 'p3': 0}, - {'p1': 'b', 'p2': 1, 'p3': 0}, - ]) - by = 'p1' + data = pd.DataFrame( + [ + {"p1": "a", "p2": 1, "p3": 0}, + {"p1": "a", "p2": 1, "p3": 0}, + {"p1": "a", "p2": 2, "p3": 0}, + {"p1": "b", "p2": 2, "p3": 0}, + {"p1": "b", "p2": 1, "p3": 0}, + {"p1": "b", "p2": 1, "p3": 0}, + {"p1": "b", "p2": 1, "p3": 0}, + ] + ) + + by = "p1" key = list(data.groupby(by))[0][0] - result = { - 'by': by, - 'key': key, - 'type(key)': type(key) - } - print('result = {}'.format(ub.repr2(result, nl=1))) + result = {"by": by, "key": key, "type(key)": type(key)} + print(f"result = {ub.repr2(result, nl=1)}") + assert not ub.iterable( + key + ), "`by` is specified as a scalar, so getting `key` as a scalar makes sense" + + by = ["p1"] + key = list(data.groupby(by))[0][0] + result = {"by": by, "key": key, "type(key)": type(key)} + print(f"result = 
{ub.repr2(result, nl=1)}") assert not ub.iterable(key), ( - '`by` is specified as a scalar, so getting `key` as a scalar makes sense') + "`by` is specified as a list of scalars (with one element), but we " + "still get `key` as a scalar. This does not make sense" + ) - by = ['p1'] + by = ["p1", "p2"] key = list(data.groupby(by))[0][0] - result = { - 'by': by, - 'key': key, - 'type(key)': type(key) - } - print('result = {}'.format(ub.repr2(result, nl=1))) - assert not ub.iterable(key), ( - '`by` is specified as a list of scalars (with one element), but we ' - 'still get `key` as a scalar. This does not make sense') - - by = ['p1', 'p2'] - key = list(data.groupby(by))[0][0] - result = { - 'by': by, - 'key': key, - 'type(key)': type(key) - } - print('result = {}'.format(ub.repr2(result, nl=1))) + result = {"by": by, "key": key, "type(key)": type(key)} + print(f"result = {ub.repr2(result, nl=1)}") assert ub.iterable(key), ( - '`by` is specified as a list of scalars (with multiple elements), ' - 'and we still get `key` as a tuple of values. This makes sense') + "`by` is specified as a list of scalars (with multiple elements), " + "and we still get `key` as a tuple of values. This makes sense" + ) -def aggregate_stats(data, suffix='', group_keys=None): +def aggregate_stats(data, suffix="", group_keys=None): """ Given columns interpreted as containing stats, aggregate those stats within each group. For each row, any non-group, non-stat column diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 18617c7..5fa57aa 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -40,9 +40,9 @@ class JSONBenchmarkConfig(scfg.Config): In "analyze" mode, no benchmarks are run, but any existing benchmarks are loaded for analysis and visualization. 
- """) + """ + ), ), - "disable": scfg.Value( [], choices=KNOWN_LIBRARIES, @@ -82,6 +82,7 @@ class JSONBenchmarkConfig(scfg.Config): def available_json_impls(): import importlib + known_modnames = KNOWN_LIBRARIES json_impls = {} for libname in known_modnames: @@ -206,7 +207,9 @@ def analyze_results(result_fpaths): single_size = table[(table["size"] == 256) | table["size"].isnull()] # single_size_combo = aggregate_stats(single_size, None) - single_size_combo = util_stats.aggregate_stats(single_size, suffix='_time', group_keys=["name"]) + single_size_combo = util_stats.aggregate_stats( + single_size, suffix="_time", group_keys=["name"] + ) param_group = ["impl", "impl_version"] single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] @@ -216,16 +219,16 @@ def analyze_results(result_fpaths): # ) time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") - hz_piv = (1 / time_piv) + hz_piv = 1 / time_piv # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") print("Table for size=256") # print(hzstr_piv.to_markdown()) - print(hz_piv.to_markdown(floatfmt=',.02f')) + print(hz_piv.to_markdown(floatfmt=",.02f")) print("") print("Above metrics are in call/sec, larger is better.") - speedup_piv = hz_piv / hz_piv['json'].values - print(speedup_piv.to_markdown(floatfmt=',.02g')) + speedup_piv = hz_piv / hz_piv["json"].values + print(speedup_piv.to_markdown(floatfmt=",.02g")) analysis.abalate(param_group) # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) @@ -241,10 +244,14 @@ def analyze_results(result_fpaths): "size": [], } import kwplot + kwplot.autosns() plots = analysis.plot( - xlabel, metric_key, group_labels, - xscale='log', yscale='log', + xlabel, + metric_key, + group_labels, + xscale="log", + yscale="log", ) plots kwplot.show_if_requested() @@ -265,16 +272,16 @@ def main(cmdline=True, **kwargs): config = JSONBenchmarkConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] - run = config['mode'] in {'all', 'single', 'run'} + run = config["mode"] in {"all", "single", "run"} if run: result_fpath = benchmark_json() print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - agg = config['mode'] not in {'single'} + agg = config["mode"] not in {"single"} if agg: result_fpaths = list(dpath.glob("benchmarks*.json")) - analyze = config['mode'] in {'all', 'single', 'analyze'} + analyze = config["mode"] in {"all", "single", "analyze"} if analyze: analyze_results(result_fpaths) From 3b29b746e39c71157f77a67c2b429c9b392e3efa Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 28 May 2022 22:12:02 -0400 Subject: [PATCH 11/25] wip --- json_benchmarks/benchmarker/result_analysis.py | 12 ++++++++++-- json_benchmarks/core.py | 4 +++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 81865e2..4bc52e1 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -821,12 +821,17 @@ class ResultAnalysis(ub.NiceRepr): >>> kwargs = {'xscale': 'log', 'yscale': 'log'} >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ + print('Init seaborn and pyplot') import seaborn as sns sns.set() from matplotlib import pyplot as plt # NOQA + print('Starting plot') + data = self.table data = data.sort_values(metric_key) + + print('Compute group labels') for gname, labels in group_labels.items(): if len(labels): new_col = [] @@ -880,6 +885,7 @@ class ResultAnalysis(ub.NiceRepr): 
plots = [] base_fnum = 1 + print('Start plots') for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): # TODO: seaborn doesn't give us any option to reuse an existing # figure or even specify what it's handle should be. A patch should @@ -891,8 +897,8 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.relplot( data=group, - # kind='line', - kind="scatter", + kind='line', + # kind="scatter", facet_kws=facet_kws, **plot_kws, ) @@ -910,6 +916,7 @@ class ResultAnalysis(ub.NiceRepr): } plots.append(plot) + print('Adjust plots') for plot in plots: xscale = kwargs.get('xscale', None) yscale = kwargs.get('yscale', None) @@ -924,6 +931,7 @@ class ResultAnalysis(ub.NiceRepr): ax.set_yscale(yscale) except ValueError: pass + print('Finish') return plots diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 18617c7..7ada55a 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -179,7 +179,7 @@ def analyze_results(result_fpaths): import json results = [] - for fpath in result_fpaths: + for fpath in ub.ProgIter(result_fpaths, desc='load results'): data = json.loads(fpath.read_text()) for row in data["rows"]: result = benchmarker.BenchmarkerResult.load(fpath) @@ -242,6 +242,8 @@ def analyze_results(result_fpaths): } import kwplot kwplot.autosns() + self = analysis + plots = analysis.plot( xlabel, metric_key, group_labels, xscale='log', yscale='log', From 283b5e5f9ba1804d8f2f6e37c2655d4f59af4553 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 02:12:17 +0000 Subject: [PATCH 12/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/benchmarker/result_analysis.py | 14 +++++++------- json_benchmarks/core.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index ade9f36..da9ce05 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -821,18 +821,18 @@ class ResultAnalysis(ub.NiceRepr): >>> kwargs = {'xscale': 'log', 'yscale': 'log'} >>> self.plot(xlabel, metric_key, group_labels, **kwargs) """ - print('Init seaborn and pyplot') + print("Init seaborn and pyplot") import seaborn as sns sns.set() from matplotlib import pyplot as plt # NOQA - print('Starting plot') + print("Starting plot") data = self.table data = data.sort_values(metric_key) - print('Compute group labels') + print("Compute group labels") for gname, labels in group_labels.items(): if len(labels): new_col = [] @@ -886,7 +886,7 @@ class ResultAnalysis(ub.NiceRepr): plots = [] base_fnum = 1 - print('Start plots') + print("Start plots") for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): # TODO: seaborn doesn't give us any option to reuse an existing # figure or even specify what it's handle should be. 
A patch should @@ -898,7 +898,7 @@ class ResultAnalysis(ub.NiceRepr): facet = sns.relplot( data=group, - kind='line', + kind="line", # kind="scatter", facet_kws=facet_kws, **plot_kws, @@ -917,7 +917,7 @@ class ResultAnalysis(ub.NiceRepr): } plots.append(plot) - print('Adjust plots') + print("Adjust plots") for plot in plots: xscale = kwargs.get("xscale", None) yscale = kwargs.get("yscale", None) @@ -932,7 +932,7 @@ class ResultAnalysis(ub.NiceRepr): ax.set_yscale(yscale) except ValueError: pass - print('Finish') + print("Finish") return plots diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 509e0c2..69b38c0 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -180,7 +180,7 @@ def analyze_results(result_fpaths): import json results = [] - for fpath in ub.ProgIter(result_fpaths, desc='load results'): + for fpath in ub.ProgIter(result_fpaths, desc="load results"): data = json.loads(fpath.read_text()) for row in data["rows"]: result = benchmarker.BenchmarkerResult.load(fpath) From 03ae1b85459dd234dec3803debbd9aa6061fb89e Mon Sep 17 00:00:00 2001 From: joncrall Date: Sat, 28 May 2022 23:04:04 -0400 Subject: [PATCH 13/25] use aggregate mean std to plot errors --- .../benchmarker/result_analysis.py | 95 +++++++++++++++---- json_benchmarks/core.py | 7 +- 2 files changed, 83 insertions(+), 19 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index da9ce05..9f1730a 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -788,7 +788,7 @@ class ResultAnalysis(ub.NiceRepr): conclusions.append(txt) return conclusions - def plot(self, xlabel, metric_key, group_labels, **kwargs): + def plot(self, xlabel, metric_key, group_labels, data=None, **kwargs): """ Args: group_labels (dict): @@ -829,7 +829,8 @@ class ResultAnalysis(ub.NiceRepr): print("Starting plot") - data = self.table + if data is None: + data = self.table data = data.sort_values(metric_key) print("Compute group labels") @@ -884,10 +885,15 @@ class ResultAnalysis(ub.NiceRepr): palette = ub.dzip(unique_hues, sns.color_palette(n_colors=len(unique_hues))) plot_kws["palette"] = palette + # kwplot.close_figures() + plots = [] base_fnum = 1 print("Start plots") - for fnum, (fig_key, group) in enumerate(groups, start=base_fnum): + # hack + hack_groups = [(k, v) for k, v in groups if k != "input=Complex object"] + + for fnum, (fig_key, group) in enumerate(hack_groups, start=base_fnum): # TODO: seaborn doesn't give us any option to reuse an existing # figure or even specify what it's handle should be. 
A patch should # be submitted to add that feature, but in the meantime work around @@ -903,6 +909,50 @@ class ResultAnalysis(ub.NiceRepr): facet_kws=facet_kws, **plot_kws, ) + from json_benchmarks.benchmarker.util_stats import aggregate_stats + + facet_data_groups = dict(list(facet.data.groupby(facet._col_var))) + # facet_data_group_iter = iter(facet_data_groups.keys()) + + for ax in facet.axes.ravel(): + col_key = ax.get_title().split('=', 1)[-1].strip() + # col_key = next(facet_data_group_iter) + col_data = facet_data_groups[col_key] + col_data['mean_time'] + col_data['std_time'] + xlabel = plot_kws['x'] + ylabel = plot_kws['y'] + subgroups = col_data.groupby(plot_kws['hue']) + for subgroup_key, subgroup in subgroups: + # combine stds in multiple groups on the x and manually draw errors + suffix = '_' + ylabel.partition('_')[2] + if 'mean_' in ylabel: + std_label = ylabel.replace('mean_', 'std_') + combo_group = aggregate_stats(subgroup, suffix=suffix, group_keys=[plot_kws['x']]) + _xdata = combo_group[xlabel].values + _ydata_mean = combo_group[ylabel].values + _ydata_std = combo_group[std_label].values + std_label = ylabel.replace('mean_', 'std_') + y_data_min = _ydata_mean - _ydata_std + y_data_max = _ydata_mean + _ydata_std + spread_alpha = 0.3 + color = palette[subgroup_key] + ax.fill_between(_xdata, y_data_min, y_data_max, alpha=spread_alpha, color=color, zorder=1) + # zorder=0) + + xscale = kwargs.get("xscale", None) + yscale = kwargs.get("yscale", None) + for ax in facet.axes.ravel(): + if xscale is not None: + try: + ax.set_xscale(xscale) + except ValueError: + pass + if yscale is not None: + try: + ax.set_yscale(yscale) + except ValueError: + pass fig = facet.figure fig.suptitle(fig_key) @@ -917,21 +967,30 @@ class ResultAnalysis(ub.NiceRepr): } plots.append(plot) - print("Adjust plots") - for plot in plots: - xscale = kwargs.get("xscale", None) - yscale = kwargs.get("yscale", None) - for ax in plot["facet"].axes.ravel(): - if xscale is not None: - try: - ax.set_xscale(xscale) - except ValueError: - pass - if yscale is not None: - try: - ax.set_yscale(yscale) - except ValueError: - pass + # if fnum >= 1: + # break + + # print("Adjust plots") + # for plot in plots: + # xscale = kwargs.get("xscale", None) + # yscale = kwargs.get("yscale", None) + # facet = plot["facet"] + + # facet_data_groups = dict(list(facet.data.groupby(facet._col_var))) + # facet_data_group_iter = iter(facet_data_groups.keys()) + + # for ax in facet.axes.ravel(): + + # if xscale is not None: + # try: + # ax.set_xscale(xscale) + # except ValueError: + # pass + # if yscale is not None: + # try: + # ax.set_yscale(yscale) + # except ValueError: + # pass print("Finish") return plots diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 69b38c0..a9c6d2d 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -204,8 +204,11 @@ def analyze_results(result_fpaths): analysis.analysis() table = analysis.table + stats_table = util_stats.aggregate_stats( + table, suffix="_time", group_keys=["name"] + ) - single_size = table[(table["size"] == 256) | table["size"].isnull()] + single_size = stats_table[(stats_table["size"] == 256) | stats_table["size"].isnull()] # single_size_combo = aggregate_stats(single_size, None) single_size_combo = util_stats.aggregate_stats( single_size, suffix="_time", group_keys=["name"] @@ -248,12 +251,14 @@ def analyze_results(result_fpaths): kwplot.autosns() self = analysis + data = stats_table plots = analysis.plot( xlabel, metric_key, group_labels, xscale="log", 
yscale="log", + data=data, ) plots kwplot.show_if_requested() From eee2a5ff66e0d0950a7d176c1407b60364bea100 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 03:04:18 +0000 Subject: [PATCH 14/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../benchmarker/result_analysis.py | 33 ++++++++++++------- json_benchmarks/core.py | 8 ++--- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 9f1730a..2702ecd 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -915,29 +915,38 @@ class ResultAnalysis(ub.NiceRepr): # facet_data_group_iter = iter(facet_data_groups.keys()) for ax in facet.axes.ravel(): - col_key = ax.get_title().split('=', 1)[-1].strip() + col_key = ax.get_title().split("=", 1)[-1].strip() # col_key = next(facet_data_group_iter) col_data = facet_data_groups[col_key] - col_data['mean_time'] - col_data['std_time'] - xlabel = plot_kws['x'] - ylabel = plot_kws['y'] - subgroups = col_data.groupby(plot_kws['hue']) + col_data["mean_time"] + col_data["std_time"] + xlabel = plot_kws["x"] + ylabel = plot_kws["y"] + subgroups = col_data.groupby(plot_kws["hue"]) for subgroup_key, subgroup in subgroups: # combine stds in multiple groups on the x and manually draw errors - suffix = '_' + ylabel.partition('_')[2] - if 'mean_' in ylabel: - std_label = ylabel.replace('mean_', 'std_') - combo_group = aggregate_stats(subgroup, suffix=suffix, group_keys=[plot_kws['x']]) + suffix = "_" + ylabel.partition("_")[2] + if "mean_" in ylabel: + std_label = ylabel.replace("mean_", "std_") + combo_group = aggregate_stats( + subgroup, suffix=suffix, group_keys=[plot_kws["x"]] + ) _xdata = combo_group[xlabel].values _ydata_mean = combo_group[ylabel].values _ydata_std = combo_group[std_label].values - std_label = ylabel.replace('mean_', 'std_') + std_label = ylabel.replace("mean_", "std_") y_data_min = _ydata_mean - _ydata_std y_data_max = _ydata_mean + _ydata_std spread_alpha = 0.3 color = palette[subgroup_key] - ax.fill_between(_xdata, y_data_min, y_data_max, alpha=spread_alpha, color=color, zorder=1) + ax.fill_between( + _xdata, + y_data_min, + y_data_max, + alpha=spread_alpha, + color=color, + zorder=1, + ) # zorder=0) xscale = kwargs.get("xscale", None) diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index a9c6d2d..0fa7969 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -204,11 +204,11 @@ def analyze_results(result_fpaths): analysis.analysis() table = analysis.table - stats_table = util_stats.aggregate_stats( - table, suffix="_time", group_keys=["name"] - ) + stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) - single_size = stats_table[(stats_table["size"] == 256) | stats_table["size"].isnull()] + single_size = stats_table[ + (stats_table["size"] == 256) | stats_table["size"].isnull() + ] # single_size_combo = aggregate_stats(single_size, None) single_size_combo = util_stats.aggregate_stats( single_size, suffix="_time", group_keys=["name"] From bd592fdd3bb4c9cf1706d9ab58ed74ea2ff7563d Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 13:26:59 -0400 Subject: [PATCH 15/25] Refactor core into measures and analysis submodules --- json_benchmarks/analysis.py | 112 +++++++++ json_benchmarks/benchmarker/__init__.py | 90 
+++---- json_benchmarks/benchmarker/aggregate.py | 74 ------ json_benchmarks/benchmarker/benchmarker.py | 62 +++++ .../benchmarker/result_analysis.py | 8 +- json_benchmarks/core.py | 225 +----------------- json_benchmarks/libraries.py | 67 ++++++ json_benchmarks/measures.py | 126 ++++++++++ 8 files changed, 409 insertions(+), 355 deletions(-) create mode 100644 json_benchmarks/analysis.py delete mode 100644 json_benchmarks/benchmarker/aggregate.py create mode 100644 json_benchmarks/libraries.py create mode 100644 json_benchmarks/measures.py diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py new file mode 100644 index 0000000..acda85e --- /dev/null +++ b/json_benchmarks/analysis.py @@ -0,0 +1,112 @@ +""" +The analysis of the measurements +""" +import scriptconfig as scfg +import ubelt as ub + + +class AnalysisConfig(scfg.Config): + default = { + "cache_dir": scfg.Value( + None, + help=ub.paragraph( + """ + Location for benchmark cache. + Defaults to $XDG_CACHE/ujson/benchmark_results/ + """ + ), + ), + } + + def normalize(self): + dpath = self["cache_dir"] + if dpath is None: + dpath = ub.Path.appdir("ujson/benchmark_results") + dpath = ub.Path(dpath) + self["cache_dir"] = dpath + + +def analyze_results(result_fpaths): + from json_benchmarks.benchmarker import util_stats + from json_benchmarks import benchmarker + import json + + results = [] + for fpath in ub.ProgIter(result_fpaths, desc="load results"): + data = json.loads(fpath.read_text()) + for row in data["rows"]: + result = benchmarker.BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + RECORD_ALL = 0 + metric_key = "time" if RECORD_ALL else "mean_time" + + # results = benchmark.result.to_result_list() + + analysis = benchmarker.result_analysis.ResultAnalysis( + results, + metrics=[metric_key], + params=["impl"], + metric_objectives={ + "min_time": "min", + "mean_time": "min", + "time": "min", + }, + ) + analysis.analysis() + + table = analysis.table + stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) + + single_size = stats_table[ + (stats_table["size"] == 256) | stats_table["size"].isnull() + ] + # single_size_combo = aggregate_stats(single_size, None) + single_size_combo = util_stats.aggregate_stats( + single_size, suffix="_time", group_keys=["name"] + ) + + param_group = ["impl", "impl_version"] + single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] + # _single_size_combo = single_size_combo.copy() + time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") + + hz_piv = 1 / time_piv + # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") + print("Table for size=256") + # print(hzstr_piv.to_markdown()) + print(hz_piv.to_markdown(floatfmt=",.02f")) + print("") + print("Above metrics are in call/sec, larger is better.") + + speedup_piv = hz_piv / hz_piv["json"].values + print(speedup_piv.to_markdown(floatfmt=",.02g")) + + analysis.abalate(param_group) + # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) + + xlabel = "size" + # Set these to empty lists if they are not used + group_labels = { + "fig": ["input"], + "col": ["func"], + # "fig": [], + # "col": ["func" "input"], + "hue": ["impl", "impl_version"], + "size": [], + } + import kwplot + kwplot.autosns() + self = analysis # NOQA + + data = stats_table + plots = analysis.plot( + xlabel, + metric_key, + group_labels, + xscale="log", + yscale="log", + data=data, + ) + plots + kwplot.show_if_requested() diff --git 
a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index a278879..aa42063 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,65 +9,33 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import ( - aggregate, - benchmarker, - process_context, - result_analysis, - util_json, - util_stats, - visualize, -) -from json_benchmarks.benchmarker.aggregate import demo, demo_data -from json_benchmarks.benchmarker.benchmarker import ( - Benchmarker, - BenchmarkerConfig, - BenchmarkerResult, -) -from json_benchmarks.benchmarker.process_context import ProcessContext -from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, - Result, - ResultAnalysis, - SkillTracker, -) -from json_benchmarks.benchmarker.util_json import ( - ensure_json_serializable, - find_json_unserializable, - indexable_allclose, -) -from json_benchmarks.benchmarker.util_stats import ( - aggregate_stats, - combine_stats, - combine_stats_arrs, - stats_dict, -) -from json_benchmarks.benchmarker.visualize import benchmark_analysis +from json_benchmarks.benchmarker import benchmarker +from json_benchmarks.benchmarker import process_context +from json_benchmarks.benchmarker import result_analysis +from json_benchmarks.benchmarker import util_json +from json_benchmarks.benchmarker import util_stats +from json_benchmarks.benchmarker import visualize -__all__ = [ - "Benchmarker", - "BenchmarkerConfig", - "BenchmarkerResult", - "DEFAULT_METRIC_TO_OBJECTIVE", - "ProcessContext", - "Result", - "ResultAnalysis", - "SkillTracker", - "aggregate", - "aggregate_stats", - "benchmark_analysis", - "benchmarker", - "combine_stats", - "combine_stats_arrs", - "demo", - "demo_data", - "ensure_json_serializable", - "find_json_unserializable", - "indexable_allclose", - "process_context", - "result_analysis", - "stats_dict", - "util_json", - "util_stats", - "visualize", -] +from json_benchmarks.benchmarker.benchmarker import (Benchmarker, + BenchmarkerConfig, + BenchmarkerResult,) +from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker.result_analysis import ( + DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) +from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, + find_json_unserializable, + indexable_allclose,) +from json_benchmarks.benchmarker.util_stats import (aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict,) +from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + +__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', + 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', + 'ResultAnalysis', 'SkillTracker', 'aggregate_stats', + 'benchmark_analysis', 'benchmarker', 'combine_stats', + 'combine_stats_arrs', 'ensure_json_serializable', + 'find_json_unserializable', 'indexable_allclose', 'process_context', + 'result_analysis', 'stats_dict', 'util_json', 'util_stats', + 'visualize'] diff --git a/json_benchmarks/benchmarker/aggregate.py b/json_benchmarks/benchmarker/aggregate.py deleted file mode 100644 index bba5771..0000000 --- a/json_benchmarks/benchmarker/aggregate.py +++ /dev/null @@ -1,74 +0,0 @@ -import json - -import pandas as pd -import ubelt as ub - - -def demo_data(): - import numpy as np - - from json_benchmarks.benchmarker.benchmarker import Benchmarker - - impl_lut = { - "numpy": np.sum, - "builtin": sum, - 
} - - def data_lut(params): - item = 42 if params["dtype"] == "int" else 42.0 - data = [item] * params["size"] - return data - - basis = { - "impl": ["builtin", "numpy"], - "size": [10, 10000], - "dtype": ["int", "float"], - } - - dpath = ub.Path.appdir("benchmarker/agg_demo").delete().ensuredir() - - def run_one_benchmark(): - self = Benchmarker(name="agg_demo", num=10, bestof=3, basis=basis) - for params in self.iter_params(): - impl = impl_lut[params["impl"]] - data = data_lut(params) - for timer in self.measure(): - with timer: - impl(data) - fpath = self.dump_in_dpath(dpath) - return fpath - - # Run the benchmark multiple times - fpaths = [] - for _ in range(5): - fpath = run_one_benchmark() - fpaths.append(fpath) - - return fpaths - - -def demo(): - from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis - - fpaths = demo_data() - - results = [] - for fpath in fpaths: - data = json.loads(fpath.read_text()) - for row in data["rows"]: - result = BenchmarkerResult.load(fpath) - results.extend(result.to_result_list()) - - analysis = result_analysis.ResultAnalysis( - results, - metrics=["min", "mean"], - params=["impl"], - metric_objectives={ - "min": "min", - "mean": "min", - }, - ) - analysis.analysis() - # single_df = pd.DataFrame(data['rows']) - # context = data['context'] - # single_df diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index 7a0d4fa..ac53372 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -168,3 +168,65 @@ class Benchmarker: "name": key, } rows.append(row) + + +def _test_demo(): + from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis + from json_benchmarks.benchmarker.benchmarker import Benchmarker + import numpy as np + + impl_lut = { + "numpy": np.sum, + "builtin": sum, + } + + def data_lut(params): + item = 42 if params["dtype"] == "int" else 42.0 + data = [item] * params["size"] + return data + + basis = { + "impl": ["builtin", "numpy"], + "size": [10, 10000], + "dtype": ["int", "float"], + } + + dpath = ub.Path.appdir("benchmarker/agg_demo").delete().ensuredir() + + def run_one_benchmark(): + self = Benchmarker(name="agg_demo", num=10, bestof=3, basis=basis) + for params in self.iter_params(): + impl = impl_lut[params["impl"]] + data = data_lut(params) + for timer in self.measure(): + with timer: + impl(data) + fpath = self.dump_in_dpath(dpath) + return fpath + + # Run the benchmark multiple times + fpaths = [] + for _ in range(5): + fpath = run_one_benchmark() + fpaths.append(fpath) + + results = [] + for fpath in fpaths: + data = json.loads(fpath.read_text()) + for row in data["rows"]: + result = BenchmarkerResult.load(fpath) + results.extend(result.to_result_list()) + + analysis = result_analysis.ResultAnalysis( + results, + metrics=["min", "mean"], + params=["impl"], + metric_objectives={ + "min": "min", + "mean": "min", + }, + ) + analysis.analysis() + # single_df = pd.DataFrame(data['rows']) + # context = data['context'] + # single_df diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py index 2702ecd..108f3de 100644 --- a/json_benchmarks/benchmarker/result_analysis.py +++ b/json_benchmarks/benchmarker/result_analysis.py @@ -935,8 +935,12 @@ class ResultAnalysis(ub.NiceRepr): _ydata_mean = combo_group[ylabel].values _ydata_std = combo_group[std_label].values std_label = ylabel.replace("mean_", "std_") - y_data_min = _ydata_mean - _ydata_std - y_data_max = 
_ydata_mean + _ydata_std + + # Plot bars 3 standard deviations from the mean to + # get a 99.7% interval + num_std = 3 + y_data_min = _ydata_mean - num_std * _ydata_std + y_data_max = _ydata_mean + num_std * _ydata_std spread_alpha = 0.3 color = palette[subgroup_key] ax.fill_between( diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index 0fa7969..d6103be 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -1,24 +1,14 @@ """ Main definition of the benchmarks """ -import json - import scriptconfig as scfg import ubelt as ub -from json_benchmarks import benchmarker, datagen -from json_benchmarks.benchmarker import util_stats - -KNOWN_LIBRARIES = [ - "ujson", - "nujson", - "orjson", - "simplejson", - "json", -] +from json_benchmarks import measures +from json_benchmarks import analysis -class JSONBenchmarkConfig(scfg.Config): +class CoreConfig(scfg.Config): """ Benchmark JSON implementations """ @@ -43,24 +33,7 @@ class JSONBenchmarkConfig(scfg.Config): """ ), ), - "disable": scfg.Value( - [], - choices=KNOWN_LIBRARIES, - help=ub.paragraph( - """ - Remove specified libraries from the benchmarks - """ - ), - ), - "factor": scfg.Value( - 1.0, - help=ub.paragraph( - """ - Specify as a fraction to speed up benchmarks for development / - testing - """ - ), - ), + "cache_dir": scfg.Value( None, help=ub.paragraph( @@ -80,190 +53,6 @@ class JSONBenchmarkConfig(scfg.Config): self["cache_dir"] = dpath -def available_json_impls(): - import importlib - - known_modnames = KNOWN_LIBRARIES - json_impls = {} - for libname in known_modnames: - try: - module = importlib.import_module(libname) - except ImportError: - pass - else: - json_impls[libname] = { - "module": module, - "version": module.__version__, - } - return json_impls - - -def benchmark_json(): - json_impls = available_json_impls() - - data_lut = datagen.json_test_data_generators() - - # These are the parameters that we benchmark over - common_basis = { - "impl": list(json_impls.keys()), - "func": ["dumps", "loads"], - } - sized_basis = { - "input": [ - "Array with doubles", - "Array with UTF-8 strings", - # 'Medium complex object', - "Array with True values", - "Array of Dict[str, int]", - # 'Dict of List[Dict[str, int]]', - # 'Complex object' - ], - "size": [1, 2, 4, 8, 16, 32, 128, 256, 512], - # 1024, 2048, 4096, 8192, 12288], - } - predefined_basis = { - "input": ["Complex object"], - "size": [None], - } - - basis = [ - ub.dict_union(common_basis, predefined_basis), - ub.dict_union(common_basis, sized_basis), - ] - - # The Benchmarker class is a new experimental API around timerit to - # abstract away the details of timing a process over a grid of parameters, - # serializing the results, and aggregating results from disparate runs. - benchmark = benchmarker.Benchmarker( - name="bench_json", - num=1000, - bestof=100, - verbose=3, - basis=basis, - ) - - def is_blocked(params): - if params["input"] == "Complex object" and params["impl"] == "orjson": - return True - - # For each variation of your experiment, create a row. - for params in benchmark.iter_params(): - if is_blocked(params): - continue - # Make any modifications you need to compute input kwargs for each - # method here. 
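# ----------------------------------------------------------------------------
# NOTE (illustration only, not part of the patches): a standalone sketch of the
# shaded error band drawn in result_analysis above: plot the mean curve, then
# fill between mean +/- num_std * std (num_std=3 approximates a 99.7% interval
# under a normality assumption). Toy arrays only; the real code pulls these
# columns out of aggregate_stats.
import numpy as np
from matplotlib import pyplot as plt

sizes = np.array([1, 32, 256, 1024])
mean_time = np.array([2e-6, 4e-5, 3e-4, 1e-3])
std_time = np.array([2e-7, 5e-6, 4e-5, 2e-4])
num_std = 3

fig, ax = plt.subplots()
ax.plot(sizes, mean_time)
ax.fill_between(
    sizes,
    mean_time - num_std * std_time,
    mean_time + num_std * std_time,
    alpha=0.3,
    zorder=1,
)
ax.set_xscale("log")
ax.set_yscale("log")
# ----------------------------------------------------------------------------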
- impl_info = json_impls[params["impl"]] - params["impl_version"] = impl_info["version"] - module = impl_info["module"] - if params["func"] == "dumps": - method = module.dumps - data = data_lut[params["input"]](params["size"]) - elif params["func"] == "loads": - method = module.loads - to_encode = data_lut[params["input"]](params["size"]) - data = json.dumps(to_encode) - # Timerit will run some user-specified number of loops. - # and compute time stats with similar methodology to timeit - for timer in benchmark.measure(): - # Put any setup logic you dont want to time here. - # ... - with timer: - # Put the logic you want to time here - method(data) - - dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() - result_fpath = benchmark.dump_in_dpath(dpath) - return result_fpath - - -def analyze_results(result_fpaths): - import json - - results = [] - for fpath in ub.ProgIter(result_fpaths, desc="load results"): - data = json.loads(fpath.read_text()) - for row in data["rows"]: - result = benchmarker.BenchmarkerResult.load(fpath) - results.extend(result.to_result_list()) - - RECORD_ALL = 0 - metric_key = "time" if RECORD_ALL else "mean_time" - - # results = benchmark.result.to_result_list() - - analysis = benchmarker.result_analysis.ResultAnalysis( - results, - metrics=[metric_key], - params=["impl"], - metric_objectives={ - "min_time": "min", - "mean_time": "min", - "time": "min", - }, - ) - analysis.analysis() - - table = analysis.table - stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) - - single_size = stats_table[ - (stats_table["size"] == 256) | stats_table["size"].isnull() - ] - # single_size_combo = aggregate_stats(single_size, None) - single_size_combo = util_stats.aggregate_stats( - single_size, suffix="_time", group_keys=["name"] - ) - - param_group = ["impl", "impl_version"] - single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"] - # _single_size_combo = single_size_combo.copy() - # _single_size_combo["calls/sec"] = _single_size_combo["calls/sec"].apply( - # - # ) - time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time") - - hz_piv = 1 / time_piv - # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}") - print("Table for size=256") - # print(hzstr_piv.to_markdown()) - print(hz_piv.to_markdown(floatfmt=",.02f")) - print("") - print("Above metrics are in call/sec, larger is better.") - - speedup_piv = hz_piv / hz_piv["json"].values - print(speedup_piv.to_markdown(floatfmt=",.02g")) - - analysis.abalate(param_group) - # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL) - - xlabel = "size" - # Set these to empty lists if they are not used - group_labels = { - "fig": ["input"], - "col": ["func"], - # "fig": [], - # "col": ["func" "input"], - "hue": ["impl", "impl_version"], - "size": [], - } - import kwplot - - kwplot.autosns() - self = analysis - - data = stats_table - plots = analysis.plot( - xlabel, - metric_key, - group_labels, - xscale="log", - yscale="log", - data=data, - ) - plots - kwplot.show_if_requested() - - def main(cmdline=True, **kwargs): """ Example: @@ -276,12 +65,12 @@ def main(cmdline=True, **kwargs): >>> kwargs = {} >>> main(cmdline, **kwargs) """ - config = JSONBenchmarkConfig(cmdline=cmdline, data=kwargs) + config = CoreConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] run = config["mode"] in {"all", "single", "run"} if run: - result_fpath = benchmark_json() + result_fpath = measures.benchmark_json() print(f"result_fpath = {result_fpath!r}") 
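# ----------------------------------------------------------------------------
# NOTE (illustration only, not part of the patches): the mode-derived flags in
# main() here reduce to the following truth table (run the benchmarks /
# aggregate cached result files / analyze and plot):
#
#     mode        run    aggregate   analyze
#     all         yes    yes         yes
#     single      yes    no          yes
#     run         yes    yes         no
#     analyze     no     yes         yes
#
# A mode can also be selected programmatically, assuming scriptconfig merges
# keyword arguments into the defaults (as the data=kwargs call suggests):
#
#     from json_benchmarks import core
#     core.main(cmdline=False, mode="analyze")
# ----------------------------------------------------------------------------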
result_fpaths = [result_fpath] @@ -291,4 +80,4 @@ def main(cmdline=True, **kwargs): analyze = config["mode"] in {"all", "single", "analyze"} if analyze: - analyze_results(result_fpaths) + analysis.analyze_results(result_fpaths) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py new file mode 100644 index 0000000..d7a8550 --- /dev/null +++ b/json_benchmarks/libraries.py @@ -0,0 +1,67 @@ +""" +Define the json libraries we are considering +""" + +KNOWN_LIBRARIES = [ + {'modname': "ujson", 'distname': 'ujson'}, + {'modname': "nujson", 'distname': 'nujson'}, + {'modname': "orjson", 'distname': 'orjson'}, + {'modname': "simplejson", 'distname': 'simplejson'}, + {'modname': "json", 'distname': ""}, + {'modname': "simdjson", 'distname': 'pysimdjson'}, +] + +KNOWN_MODNAMES = [info['modname'] for info in KNOWN_LIBRARIES] + + +# TODO: +# def distname_to_modnames(distname): +# # TODO: nice way to switch between a module's import name and it's distribution name +# # References: +# # https://stackoverflow.com/questions/49764802/get-module-name-programmatically-with-only-pypi-package-name/49764960#49764960 +# import distlib.database +# distlib.database.DistributionPath().get_distribution(distname) +# # import importlib.metadata +# # importlib.metadata.metadata(distname) +# # importlib.util.find_spec(modname) +# # import simdjson +# # import pkg_resources +# # pkg_resources.get_distribution('pysimdjson') + + +def available_json_impls(): + """ + Return a dictionary of information about each json implementation + + Example: + >>> from json_benchmarks.libraries import * # NOQA + >>> json_impls = available_json_impls() + >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) + """ + import importlib + known_libinfo = KNOWN_LIBRARIES + json_impls = {} + for libinfo in known_libinfo: + modname = libinfo['modname'] + distname = libinfo['distname'] + try: + module = importlib.import_module(modname) + except ImportError: + pass + else: + import pkg_resources + mod_version = getattr(module, '__version__', None) + if distname == '': + pkg_version = mod_version + else: + pkg_version = pkg_resources.get_distribution(distname).version + if mod_version is not None: + assert mod_version == pkg_version + version = pkg_version + json_impls[modname] = { + "module": module, + "modname": modname, + "distname": distname, + "version": version, + } + return json_impls diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py new file mode 100644 index 0000000..1878c62 --- /dev/null +++ b/json_benchmarks/measures.py @@ -0,0 +1,126 @@ +""" +The definitions of the measurements we want to take +""" +import scriptconfig as scfg +import ubelt as ub +import json +from json_benchmarks import libraries + + +class MeasurementConfig(scfg.Config): + default = { + "disable": scfg.Value( + [], + choices=libraries.KNOWN_MODNAMES, + help=ub.paragraph( + """ + Remove specified libraries from the benchmarks + """ + ), + ), + "factor": scfg.Value( + 1.0, + help=ub.paragraph( + """ + Specify as a fraction to speed up benchmarks for development / + testing + """ + ), + ), + "cache_dir": scfg.Value( + None, + help=ub.paragraph( + """ + Location for benchmark cache. 
+ Defaults to $XDG_CACHE/ujson/benchmark_results/ + """ + ), + ), + } + + def normalize(self): + dpath = self["cache_dir"] + if dpath is None: + dpath = ub.Path.appdir("ujson/benchmark_results") + dpath = ub.Path(dpath) + self["cache_dir"] = dpath + + +def benchmark_json(): + from json_benchmarks import benchmarker + from json_benchmarks import datagen + from json_benchmarks import libraries + + json_impls = libraries.available_json_impls() + data_lut = datagen.json_test_data_generators() + + # These are the parameters that we benchmark over + common_basis = { + "impl": list(json_impls.keys()), + "func": ["dumps", "loads"], + } + sized_basis = { + "input": [ + "Array with doubles", + "Array with UTF-8 strings", + # 'Medium complex object', + "Array with True values", + "Array of Dict[str, int]", + # 'Dict of List[Dict[str, int]]', + # 'Complex object' + ], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], + } + predefined_basis = { + "input": ["Complex object"], + "size": [None], + } + + basis = [ + ub.dict_union(common_basis, predefined_basis), + ub.dict_union(common_basis, sized_basis), + ] + + # The Benchmarker class is a new experimental API around timerit to + # abstract away the details of timing a process over a grid of parameters, + # serializing the results, and aggregating results from disparate runs. + benchmark = benchmarker.Benchmarker( + name="bench_json", + num=1000, + bestof=100, + verbose=3, + basis=basis, + ) + + def is_blocked(params): + if params["input"] == "Complex object" and params["impl"] == "orjson": + return True + + # For each variation of your experiment, create a row. + for params in benchmark.iter_params(): + if is_blocked(params): + continue + # Make any modifications you need to compute input kwargs for each + # method here. + impl_info = json_impls[params["impl"]] + params["impl_version"] = impl_info["version"] + module = impl_info["module"] + if params["func"] == "dumps": + method = module.dumps + data = data_lut[params["input"]](params["size"]) + elif params["func"] == "loads": + method = module.loads + to_encode = data_lut[params["input"]](params["size"]) + data = json.dumps(to_encode) + # Timerit will run some user-specified number of loops. + # and compute time stats with similar methodology to timeit + for timer in benchmark.measure(): + # Put any setup logic you dont want to time here. + # ... 
+ with timer: + # Put the logic you want to time here + method(data) + + dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() + result_fpath = benchmark.dump_in_dpath(dpath) + return result_fpath From 2b2aedb89f1750d4f3901a07dd212927a318b32d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 17:27:13 +0000 Subject: [PATCH 16/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/analysis.py | 6 +- json_benchmarks/benchmarker/__init__.py | 83 ++++++++++++++-------- json_benchmarks/benchmarker/benchmarker.py | 3 +- json_benchmarks/core.py | 4 +- json_benchmarks/libraries.py | 24 ++++--- json_benchmarks/measures.py | 8 +-- 6 files changed, 79 insertions(+), 49 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index acda85e..2c9e923 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -27,10 +27,11 @@ class AnalysisConfig(scfg.Config): def analyze_results(result_fpaths): - from json_benchmarks.benchmarker import util_stats - from json_benchmarks import benchmarker import json + from json_benchmarks import benchmarker + from json_benchmarks.benchmarker import util_stats + results = [] for fpath in ub.ProgIter(result_fpaths, desc="load results"): data = json.loads(fpath.read_text()) @@ -96,6 +97,7 @@ def analyze_results(result_fpaths): "size": [], } import kwplot + kwplot.autosns() self = analysis # NOQA diff --git a/json_benchmarks/benchmarker/__init__.py b/json_benchmarks/benchmarker/__init__.py index aa42063..27d133a 100644 --- a/json_benchmarks/benchmarker/__init__.py +++ b/json_benchmarks/benchmarker/__init__.py @@ -9,33 +9,60 @@ mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w __version__ = "0.1.0" -from json_benchmarks.benchmarker import benchmarker -from json_benchmarks.benchmarker import process_context -from json_benchmarks.benchmarker import result_analysis -from json_benchmarks.benchmarker import util_json -from json_benchmarks.benchmarker import util_stats -from json_benchmarks.benchmarker import visualize - -from json_benchmarks.benchmarker.benchmarker import (Benchmarker, - BenchmarkerConfig, - BenchmarkerResult,) -from json_benchmarks.benchmarker.process_context import (ProcessContext,) +from json_benchmarks.benchmarker import ( + benchmarker, + process_context, + result_analysis, + util_json, + util_stats, + visualize, +) +from json_benchmarks.benchmarker.benchmarker import ( + Benchmarker, + BenchmarkerConfig, + BenchmarkerResult, +) +from json_benchmarks.benchmarker.process_context import ProcessContext from json_benchmarks.benchmarker.result_analysis import ( - DEFAULT_METRIC_TO_OBJECTIVE, Result, ResultAnalysis, SkillTracker,) -from json_benchmarks.benchmarker.util_json import (ensure_json_serializable, - find_json_unserializable, - indexable_allclose,) -from json_benchmarks.benchmarker.util_stats import (aggregate_stats, - combine_stats, - combine_stats_arrs, - stats_dict,) -from json_benchmarks.benchmarker.visualize import (benchmark_analysis,) + DEFAULT_METRIC_TO_OBJECTIVE, + Result, + ResultAnalysis, + SkillTracker, +) +from json_benchmarks.benchmarker.util_json import ( + ensure_json_serializable, + find_json_unserializable, + indexable_allclose, +) +from json_benchmarks.benchmarker.util_stats import ( + aggregate_stats, + combine_stats, + combine_stats_arrs, + stats_dict, +) +from json_benchmarks.benchmarker.visualize import 
benchmark_analysis -__all__ = ['Benchmarker', 'BenchmarkerConfig', 'BenchmarkerResult', - 'DEFAULT_METRIC_TO_OBJECTIVE', 'ProcessContext', 'Result', - 'ResultAnalysis', 'SkillTracker', 'aggregate_stats', - 'benchmark_analysis', 'benchmarker', 'combine_stats', - 'combine_stats_arrs', 'ensure_json_serializable', - 'find_json_unserializable', 'indexable_allclose', 'process_context', - 'result_analysis', 'stats_dict', 'util_json', 'util_stats', - 'visualize'] +__all__ = [ + "Benchmarker", + "BenchmarkerConfig", + "BenchmarkerResult", + "DEFAULT_METRIC_TO_OBJECTIVE", + "ProcessContext", + "Result", + "ResultAnalysis", + "SkillTracker", + "aggregate_stats", + "benchmark_analysis", + "benchmarker", + "combine_stats", + "combine_stats_arrs", + "ensure_json_serializable", + "find_json_unserializable", + "indexable_allclose", + "process_context", + "result_analysis", + "stats_dict", + "util_json", + "util_stats", + "visualize", +] diff --git a/json_benchmarks/benchmarker/benchmarker.py b/json_benchmarks/benchmarker/benchmarker.py index ac53372..21e6234 100644 --- a/json_benchmarks/benchmarker/benchmarker.py +++ b/json_benchmarks/benchmarker/benchmarker.py @@ -171,9 +171,10 @@ class Benchmarker: def _test_demo(): + import numpy as np + from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis from json_benchmarks.benchmarker.benchmarker import Benchmarker - import numpy as np impl_lut = { "numpy": np.sum, diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index d6103be..c43a474 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -4,8 +4,7 @@ Main definition of the benchmarks import scriptconfig as scfg import ubelt as ub -from json_benchmarks import measures -from json_benchmarks import analysis +from json_benchmarks import analysis, measures class CoreConfig(scfg.Config): @@ -33,7 +32,6 @@ class CoreConfig(scfg.Config): """ ), ), - "cache_dir": scfg.Value( None, help=ub.paragraph( diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index d7a8550..088368c 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -3,15 +3,15 @@ Define the json libraries we are considering """ KNOWN_LIBRARIES = [ - {'modname': "ujson", 'distname': 'ujson'}, - {'modname': "nujson", 'distname': 'nujson'}, - {'modname': "orjson", 'distname': 'orjson'}, - {'modname': "simplejson", 'distname': 'simplejson'}, - {'modname': "json", 'distname': ""}, - {'modname': "simdjson", 'distname': 'pysimdjson'}, + {"modname": "ujson", "distname": "ujson"}, + {"modname": "nujson", "distname": "nujson"}, + {"modname": "orjson", "distname": "orjson"}, + {"modname": "simplejson", "distname": "simplejson"}, + {"modname": "json", "distname": ""}, + {"modname": "simdjson", "distname": "pysimdjson"}, ] -KNOWN_MODNAMES = [info['modname'] for info in KNOWN_LIBRARIES] +KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES] # TODO: @@ -39,19 +39,21 @@ def available_json_impls(): >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) """ import importlib + known_libinfo = KNOWN_LIBRARIES json_impls = {} for libinfo in known_libinfo: - modname = libinfo['modname'] - distname = libinfo['distname'] + modname = libinfo["modname"] + distname = libinfo["distname"] try: module = importlib.import_module(modname) except ImportError: pass else: import pkg_resources - mod_version = getattr(module, '__version__', None) - if distname == '': + + mod_version = getattr(module, "__version__", None) + if distname == "": pkg_version = mod_version else: pkg_version 
= pkg_resources.get_distribution(distname).version diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index 1878c62..8e768ce 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -1,9 +1,11 @@ """ The definitions of the measurements we want to take """ +import json + import scriptconfig as scfg import ubelt as ub -import json + from json_benchmarks import libraries @@ -47,9 +49,7 @@ class MeasurementConfig(scfg.Config): def benchmark_json(): - from json_benchmarks import benchmarker - from json_benchmarks import datagen - from json_benchmarks import libraries + from json_benchmarks import benchmarker, datagen, libraries json_impls = libraries.available_json_impls() data_lut = datagen.json_test_data_generators() From b0bc25ab3c02d7f652d8e06abd253dee0ec1266f Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 18:56:24 -0400 Subject: [PATCH 17/25] Add simd libraries --- json_benchmarks/core.py | 6 +++--- json_benchmarks/libraries.py | 36 ++++++++++++++++++++++++++++++++---- json_benchmarks/measures.py | 21 ++++++++++++--------- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index c43a474..af11174 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -72,9 +72,9 @@ def main(cmdline=True, **kwargs): print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - agg = config["mode"] not in {"single"} - if agg: - result_fpaths = list(dpath.glob("benchmarks*.json")) + # agg = config["mode"] not in {"single"} + # if agg: + # result_fpaths = list(dpath.glob("benchmarks*.json")) analyze = config["mode"] in {"all", "single", "analyze"} if analyze: diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index 088368c..f828e3f 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -9,6 +9,8 @@ KNOWN_LIBRARIES = [ {"modname": "simplejson", "distname": "simplejson"}, {"modname": "json", "distname": ""}, {"modname": "simdjson", "distname": "pysimdjson"}, + {"modname": "cysimdjson", "distname": "cysimdjson"}, + {"modname": "libpy_simdjson", "distname": "libpy-simdjson"}, ] KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES] @@ -29,6 +31,29 @@ KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES] # # pkg_resources.get_distribution('pysimdjson') +class Compatability: + """ + Expose a common API for all tested implmentations + """ + + @staticmethod + def lut_dumps(module): + if module.__name__ == 'cysimdjson': + return None + elif module.__name__ == 'pysimdjson': + return None + else: + return getattr(module, 'dumps', None) + + @staticmethod + def lut_loads(module): + if module.__name__ == 'cysimdjson': + parser = module.JSONParser() + return parser.loads + else: + return getattr(module, 'loads', None) + + def available_json_impls(): """ Return a dictionary of information about each json implementation @@ -39,7 +64,7 @@ def available_json_impls(): >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) """ import importlib - + import pkg_resources known_libinfo = KNOWN_LIBRARIES json_impls = {} for libinfo in known_libinfo: @@ -50,8 +75,6 @@ def available_json_impls(): except ImportError: pass else: - import pkg_resources - mod_version = getattr(module, "__version__", None) if distname == "": pkg_version = mod_version @@ -60,10 +83,15 @@ def available_json_impls(): if mod_version is not None: assert mod_version == pkg_version version = pkg_version - json_impls[modname] = { + dumps = 
Compatability.lut_dumps(module) + loads = Compatability.lut_loads(module) + impl_info = { "module": module, "modname": modname, "distname": distname, "version": version, + "dumps": dumps, + "loads": loads, } + json_impls[modname] = impl_info return json_impls diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index 8e768ce..ca209b1 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -69,7 +69,7 @@ def benchmark_json(): # 'Dict of List[Dict[str, int]]', # 'Complex object' ], - "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 12288], + "size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192], } predefined_basis = { "input": ["Complex object"], @@ -93,8 +93,10 @@ def benchmark_json(): ) def is_blocked(params): - if params["input"] == "Complex object" and params["impl"] == "orjson": - return True + if params["input"] == "Complex object": + # Some libraries can't handle the complex object + if params["impl"] in {"orjson", "libpy_simdjson"}: + return True # For each variation of your experiment, create a row. for params in benchmark.iter_params(): @@ -104,14 +106,15 @@ def benchmark_json(): # method here. impl_info = json_impls[params["impl"]] params["impl_version"] = impl_info["version"] - module = impl_info["module"] + method = impl_info[params["func"]] + if method is None: + # Not all libraries implement all methods + continue + py_data = data_lut[params["input"]](params["size"]) if params["func"] == "dumps": - method = module.dumps - data = data_lut[params["input"]](params["size"]) + data = py_data elif params["func"] == "loads": - method = module.loads - to_encode = data_lut[params["input"]](params["size"]) - data = json.dumps(to_encode) + data = json.dumps(py_data) # Timerit will run some user-specified number of loops. 
# and compute time stats with similar methodology to timeit for timer in benchmark.measure(): From 80d096015e35a747e9756c42ae16fb324fbb543f Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:07:46 -0400 Subject: [PATCH 18/25] Fix cysimdjson --- json_benchmarks/analysis.py | 1 - json_benchmarks/core.py | 7 ++++--- json_benchmarks/measures.py | 19 +++++++++++-------- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index 2c9e923..35cb499 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -28,7 +28,6 @@ class AnalysisConfig(scfg.Config): def analyze_results(result_fpaths): import json - from json_benchmarks import benchmarker from json_benchmarks.benchmarker import util_stats diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index af11174..b840c69 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -65,6 +65,7 @@ def main(cmdline=True, **kwargs): """ config = CoreConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] + print(f'dpath={dpath}') run = config["mode"] in {"all", "single", "run"} if run: @@ -72,9 +73,9 @@ def main(cmdline=True, **kwargs): print(f"result_fpath = {result_fpath!r}") result_fpaths = [result_fpath] - # agg = config["mode"] not in {"single"} - # if agg: - # result_fpaths = list(dpath.glob("benchmarks*.json")) + agg = config["mode"] not in {"single"} + if agg: + result_fpaths = list(dpath.glob("benchmarks*.json")) analyze = config["mode"] in {"all", "single", "analyze"} if analyze: diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index ca209b1..7913cc6 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -86,8 +86,8 @@ def benchmark_json(): # serializing the results, and aggregating results from disparate runs. benchmark = benchmarker.Benchmarker( name="bench_json", - num=1000, - bestof=100, + num=100, + bestof=10, verbose=3, basis=basis, ) @@ -117,12 +117,15 @@ def benchmark_json(): data = json.dumps(py_data) # Timerit will run some user-specified number of loops. # and compute time stats with similar methodology to timeit - for timer in benchmark.measure(): - # Put any setup logic you dont want to time here. - # ... - with timer: - # Put the logic you want to time here - method(data) + try: + for timer in benchmark.measure(): + # Put any setup logic you dont want to time here. + # ... + with timer: + # Put the logic you want to time here + method(data) + except Exception as ex: + print(f'Failed to time: ex={ex}. 
Skipping') dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() result_fpath = benchmark.dump_in_dpath(dpath) From 7dbb203450810217c6f8790c01fb54d73d5305b7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 29 May 2022 23:08:02 +0000 Subject: [PATCH 19/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- json_benchmarks/analysis.py | 1 + json_benchmarks/core.py | 2 +- json_benchmarks/libraries.py | 12 +++++++----- json_benchmarks/measures.py | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index 35cb499..2c9e923 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -28,6 +28,7 @@ class AnalysisConfig(scfg.Config): def analyze_results(result_fpaths): import json + from json_benchmarks import benchmarker from json_benchmarks.benchmarker import util_stats diff --git a/json_benchmarks/core.py b/json_benchmarks/core.py index b840c69..9b760c8 100644 --- a/json_benchmarks/core.py +++ b/json_benchmarks/core.py @@ -65,7 +65,7 @@ def main(cmdline=True, **kwargs): """ config = CoreConfig(cmdline=cmdline, data=kwargs) dpath = config["cache_dir"] - print(f'dpath={dpath}') + print(f"dpath={dpath}") run = config["mode"] in {"all", "single", "run"} if run: diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index f828e3f..027f7e9 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -38,20 +38,20 @@ class Compatability: @staticmethod def lut_dumps(module): - if module.__name__ == 'cysimdjson': + if module.__name__ == "cysimdjson": return None - elif module.__name__ == 'pysimdjson': + elif module.__name__ == "pysimdjson": return None else: - return getattr(module, 'dumps', None) + return getattr(module, "dumps", None) @staticmethod def lut_loads(module): - if module.__name__ == 'cysimdjson': + if module.__name__ == "cysimdjson": parser = module.JSONParser() return parser.loads else: - return getattr(module, 'loads', None) + return getattr(module, "loads", None) def available_json_impls(): @@ -64,7 +64,9 @@ def available_json_impls(): >>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1))) """ import importlib + import pkg_resources + known_libinfo = KNOWN_LIBRARIES json_impls = {} for libinfo in known_libinfo: diff --git a/json_benchmarks/measures.py b/json_benchmarks/measures.py index 7913cc6..c44a461 100644 --- a/json_benchmarks/measures.py +++ b/json_benchmarks/measures.py @@ -125,7 +125,7 @@ def benchmark_json(): # Put the logic you want to time here method(data) except Exception as ex: - print(f'Failed to time: ex={ex}. Skipping') + print(f"Failed to time: ex={ex}. 
Skipping") dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir() result_fpath = benchmark.dump_in_dpath(dpath) From 9358a546e1f1668df88bb394e87ffcf6e1de36cc Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:08:37 -0400 Subject: [PATCH 20/25] name fix --- json_benchmarks/libraries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index f828e3f..bcf60d7 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -40,7 +40,7 @@ class Compatability: def lut_dumps(module): if module.__name__ == 'cysimdjson': return None - elif module.__name__ == 'pysimdjson': + elif module.__name__ == 'simdjson': return None else: return getattr(module, 'dumps', None) From 9196d05d0b508691972c069df04a8379a8addf1a Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:10:51 -0400 Subject: [PATCH 21/25] wip --- json_benchmarks/libraries.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index aff9625..ee4b74d 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -40,11 +40,7 @@ class Compatability: def lut_dumps(module): if module.__name__ == "cysimdjson": return None -<<<<<<< HEAD elif module.__name__ == 'simdjson': -======= - elif module.__name__ == "pysimdjson": ->>>>>>> 7dbb203450810217c6f8790c01fb54d73d5305b7 return None else: return getattr(module, "dumps", None) From 2f3070d74f9125dcbf1f1a1abac22809526b165f Mon Sep 17 00:00:00 2001 From: joncrall Date: Sun, 29 May 2022 19:11:03 -0400 Subject: [PATCH 22/25] wip --- json_benchmarks/libraries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py index ee4b74d..cb0efb7 100644 --- a/json_benchmarks/libraries.py +++ b/json_benchmarks/libraries.py @@ -40,7 +40,7 @@ class Compatability: def lut_dumps(module): if module.__name__ == "cysimdjson": return None - elif module.__name__ == 'simdjson': + elif module.__name__ == "simdjson": return None else: return getattr(module, "dumps", None) From ac5b1437120e18cf9fbc96476447d2951a6f6726 Mon Sep 17 00:00:00 2001 From: joncrall Date: Mon, 30 May 2022 21:29:51 -0400 Subject: [PATCH 23/25] stats for fix-encode-surrogates --- json_benchmarks/analysis.py | 6 +++--- json_benchmarks/libraries.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py index 2c9e923..d700e87 100644 --- a/json_benchmarks/analysis.py +++ b/json_benchmarks/analysis.py @@ -47,7 +47,7 @@ def analyze_results(result_fpaths): analysis = benchmarker.result_analysis.ResultAnalysis( results, metrics=[metric_key], - params=["impl"], + params=["impl", "impl_version"], metric_objectives={ "min_time": "min", "mean_time": "min", @@ -57,14 +57,14 @@ def analyze_results(result_fpaths): analysis.analysis() table = analysis.table - stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name"]) + stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name", "impl_version"]) single_size = stats_table[ (stats_table["size"] == 256) | stats_table["size"].isnull() ] # single_size_combo = aggregate_stats(single_size, None) single_size_combo = util_stats.aggregate_stats( - single_size, suffix="_time", group_keys=["name"] + single_size, suffix="_time", group_keys=["name", "impl_version"] ) param_group = ["impl", "impl_version"] diff --git 
a/json_benchmarks/libraries.py b/json_benchmarks/libraries.py
index cb0efb7..4fe04a8 100644
--- a/json_benchmarks/libraries.py
+++ b/json_benchmarks/libraries.py
@@ -4,13 +4,13 @@ Define the json libraries we are considering
 
 KNOWN_LIBRARIES = [
     {"modname": "ujson", "distname": "ujson"},
-    {"modname": "nujson", "distname": "nujson"},
-    {"modname": "orjson", "distname": "orjson"},
-    {"modname": "simplejson", "distname": "simplejson"},
+    # {"modname": "nujson", "distname": "nujson"},
+    # {"modname": "orjson", "distname": "orjson"},
+    # {"modname": "simplejson", "distname": "simplejson"},
     {"modname": "json", "distname": ""},
-    {"modname": "simdjson", "distname": "pysimdjson"},
-    {"modname": "cysimdjson", "distname": "cysimdjson"},
-    {"modname": "libpy_simdjson", "distname": "libpy-simdjson"},
+    # {"modname": "simdjson", "distname": "pysimdjson"},
+    # {"modname": "cysimdjson", "distname": "cysimdjson"},
+    # {"modname": "libpy_simdjson", "distname": "libpy-simdjson"},
 ]
 
 KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES]

From fd951a31f177ce863c9ceff357213183f340ecb4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 31 May 2022 01:30:42 +0000
Subject: [PATCH 24/25] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 json_benchmarks/analysis.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/json_benchmarks/analysis.py b/json_benchmarks/analysis.py
index d700e87..b3f8778 100644
--- a/json_benchmarks/analysis.py
+++ b/json_benchmarks/analysis.py
@@ -57,7 +57,9 @@ def analyze_results(result_fpaths):
     analysis.analysis()
 
     table = analysis.table
-    stats_table = util_stats.aggregate_stats(table, suffix="_time", group_keys=["name", "impl_version"])
+    stats_table = util_stats.aggregate_stats(
+        table, suffix="_time", group_keys=["name", "impl_version"]
+    )
     single_size = stats_table[
         (stats_table["size"] == 256) | stats_table["size"].isnull()
     ]

From a580404b46d387b5c86d8690d16b3f6f833455f8 Mon Sep 17 00:00:00 2001
From: joncrall
Date: Wed, 17 Jan 2024 12:12:43 -0500
Subject: [PATCH 25/25] Fixed bug

---
 json_benchmarks/benchmarker/result_analysis.py | 11 +++++++++--
 json_benchmarks/benchmarker/util_stats.py | 6 +++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/json_benchmarks/benchmarker/result_analysis.py b/json_benchmarks/benchmarker/result_analysis.py
index 108f3de..39bc262 100644
--- a/json_benchmarks/benchmarker/result_analysis.py
+++ b/json_benchmarks/benchmarker/result_analysis.py
@@ -911,13 +911,20 @@ class ResultAnalysis(ub.NiceRepr):
         )
         from json_benchmarks.benchmarker.util_stats import aggregate_stats
 
-        facet_data_groups = dict(list(facet.data.groupby(facet._col_var)))
+        # print(f'facet._col_var={facet._col_var}')
+        if facet._col_var is not None:
+            facet_data_groups = dict(list(facet.data.groupby(facet._col_var)))
+        else:
+            facet_data_groups = None
 
         # facet_data_group_iter = iter(facet_data_groups.keys())
         for ax in facet.axes.ravel():
             col_key = ax.get_title().split("=", 1)[-1].strip()
             # col_key = next(facet_data_group_iter)
-            col_data = facet_data_groups[col_key]
+            if facet_data_groups is not None:
+                col_data = facet_data_groups[col_key]
+            else:
+                col_data = facet.data
             col_data["mean_time"]
             col_data["std_time"]
             xlabel = plot_kws["x"]
diff --git a/json_benchmarks/benchmarker/util_stats.py b/json_benchmarks/benchmarker/util_stats.py
index 2eaa32c..38cf2e0 100644
--- a/json_benchmarks/benchmarker/util_stats.py
+++ b/json_benchmarks/benchmarker/util_stats.py
@@ -190,8 +190,8 @@ def combine_stats(s1, s2):
     >>> assert np.allclose(compare.raw, compare.combo)
 
     References:
-        https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations
-        https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
+        .. [SO7753002] https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations
+        .. [SO2971315] https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
     """
     stats = [s1, s2]
    data = {
@@ -201,7 +201,7 @@ def combine_stats(s1, s2):
         "min": np.array([s["min"] for s in stats]),
         "max": np.array([s["max"] for s in stats]),
     }
-    combine_stats_arrs(data)
+    return combine_stats_arrs(data)
 
 
 def combine_stats_arrs(data):
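The references cited above describe the pooled-statistics identity that combine_stats relies on: the combined mean is the count-weighted average of the group means, and the combined variance is the count-weighted average of each group's variance plus the squared offset of its mean from the combined mean. A minimal standalone sketch of that identity follows; it assumes population-style standard deviations (numpy's np.std default), and the helper name combine_two_groups is illustrative only, not part of this repository.

import numpy as np

def combine_two_groups(n1, mean1, std1, n2, mean2, std2):
    # Count-weighted mean of the two groups
    n = n1 + n2
    mean = (n1 * mean1 + n2 * mean2) / n
    # Each group contributes its own variance plus the squared offset
    # of its mean from the combined mean, weighted by its count.
    var = (
        n1 * (std1 ** 2 + (mean1 - mean) ** 2)
        + n2 * (std2 ** 2 + (mean2 - mean) ** 2)
    ) / n
    return n, mean, np.sqrt(var)

# Sanity check against a direct computation on the concatenated raw data
x1, x2 = np.random.rand(7), np.random.rand(11)
n, mean, std = combine_two_groups(len(x1), x1.mean(), x1.std(),
                                  len(x2), x2.mean(), x2.std())
raw = np.concatenate([x1, x2])
assert np.isclose(mean, raw.mean()) and np.isclose(std, raw.std())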