Mirror of https://github.com/ultrajson/ultrajson.git (synced 2024-05-19 13:26:25 +02:00)

Commit 2566b2e094: Merge a580404b46 into 04daf02b94
@@ -0,0 +1,8 @@
if __name__ == "__main__":
    """
    CommandLine:
        python -m json_benchmarks
    """
    from json_benchmarks import core

    core.main()
@@ -0,0 +1,116 @@
"""
The analysis of the measurements
"""
import scriptconfig as scfg
import ubelt as ub


class AnalysisConfig(scfg.Config):
    default = {
        "cache_dir": scfg.Value(
            None,
            help=ub.paragraph(
                """
                Location for benchmark cache.
                Defaults to $XDG_CACHE/ujson/benchmark_results/
                """
            ),
        ),
    }

    def normalize(self):
        dpath = self["cache_dir"]
        if dpath is None:
            dpath = ub.Path.appdir("ujson/benchmark_results")
        dpath = ub.Path(dpath)
        self["cache_dir"] = dpath


def analyze_results(result_fpaths):
    from json_benchmarks import benchmarker
    from json_benchmarks.benchmarker import util_stats

    results = []
    for fpath in ub.ProgIter(result_fpaths, desc="load results"):
        # Each file holds one serialized BenchmarkerResult; load it once and
        # flatten its rows into Result objects for the analysis below.
        result = benchmarker.BenchmarkerResult.load(fpath)
        results.extend(result.to_result_list())

    RECORD_ALL = 0
    metric_key = "time" if RECORD_ALL else "mean_time"

    # results = benchmark.result.to_result_list()

    analysis = benchmarker.result_analysis.ResultAnalysis(
        results,
        metrics=[metric_key],
        params=["impl", "impl_version"],
        metric_objectives={
            "min_time": "min",
            "mean_time": "min",
            "time": "min",
        },
    )
    analysis.analysis()

    table = analysis.table
    stats_table = util_stats.aggregate_stats(
        table, suffix="_time", group_keys=["name", "impl_version"]
    )

    single_size = stats_table[
        (stats_table["size"] == 256) | stats_table["size"].isnull()
    ]
    # single_size_combo = aggregate_stats(single_size, None)
    single_size_combo = util_stats.aggregate_stats(
        single_size, suffix="_time", group_keys=["name", "impl_version"]
    )

    param_group = ["impl", "impl_version"]
    single_size_combo["calls/sec"] = 1 / single_size_combo["mean_time"]
    # _single_size_combo = single_size_combo.copy()
    time_piv = single_size_combo.pivot(["input", "func"], param_group, "mean_time")

    hz_piv = 1 / time_piv
    # hzstr_piv = (1 / time_piv).applymap(lambda x: f"{x:,.02f}")
    print("Table for size=256")
    # print(hzstr_piv.to_markdown())
    print(hz_piv.to_markdown(floatfmt=",.02f"))
    print("")
    print("Above metrics are in calls/sec; larger is better.")

    speedup_piv = hz_piv / hz_piv["json"].values
    print(speedup_piv.to_markdown(floatfmt=",.02g"))

    analysis.abalate(param_group)
    # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL)

    xlabel = "size"
    # Set these to empty lists if they are not used
    group_labels = {
        "fig": ["input"],
        "col": ["func"],
        # "fig": [],
        # "col": ["func" "input"],
        "hue": ["impl", "impl_version"],
        "size": [],
    }
    import kwplot

    kwplot.autosns()
    self = analysis  # NOQA

    data = stats_table
    plots = analysis.plot(
        xlabel,
        metric_key,
        group_labels,
        xscale="log",
        yscale="log",
        data=data,
    )
    plots  # NOQA
    kwplot.show_if_requested()
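Not part of the diff: a minimal usage sketch for analyze_results, assuming this module is importable as json_benchmarks.analysis and that result files were previously written by Benchmarker.dump_in_dpath into the cache directory that AnalysisConfig resolves. The glob pattern matches the file-name format used in benchmarker.py; the plotting step additionally needs kwplot and seaborn installed.

import ubelt as ub
from json_benchmarks.analysis import AnalysisConfig, analyze_results  # assumed module path

config = AnalysisConfig()
config.normalize()  # resolves cache_dir to its default when unset
dpath = config["cache_dir"]
result_fpaths = sorted(dpath.glob("benchmarks_*.json"))  # files dumped by Benchmarker
if result_fpaths:
    analyze_results(result_fpaths)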
@@ -0,0 +1,68 @@
"""
A helper module for executing, serializing, combining, and comparing benchmarks
"""

__mkinit__ = """
# Autogenerate this file
mkinit ~/code/ultrajson/json_benchmarks/benchmarker/__init__.py -w
"""

__version__ = "0.1.0"

from json_benchmarks.benchmarker import (
    benchmarker,
    process_context,
    result_analysis,
    util_json,
    util_stats,
    visualize,
)
from json_benchmarks.benchmarker.benchmarker import (
    Benchmarker,
    BenchmarkerConfig,
    BenchmarkerResult,
)
from json_benchmarks.benchmarker.process_context import ProcessContext
from json_benchmarks.benchmarker.result_analysis import (
    DEFAULT_METRIC_TO_OBJECTIVE,
    Result,
    ResultAnalysis,
    SkillTracker,
)
from json_benchmarks.benchmarker.util_json import (
    ensure_json_serializable,
    find_json_unserializable,
    indexable_allclose,
)
from json_benchmarks.benchmarker.util_stats import (
    aggregate_stats,
    combine_stats,
    combine_stats_arrs,
    stats_dict,
)
from json_benchmarks.benchmarker.visualize import benchmark_analysis

__all__ = [
    "Benchmarker",
    "BenchmarkerConfig",
    "BenchmarkerResult",
    "DEFAULT_METRIC_TO_OBJECTIVE",
    "ProcessContext",
    "Result",
    "ResultAnalysis",
    "SkillTracker",
    "aggregate_stats",
    "benchmark_analysis",
    "benchmarker",
    "combine_stats",
    "combine_stats_arrs",
    "ensure_json_serializable",
    "find_json_unserializable",
    "indexable_allclose",
    "process_context",
    "result_analysis",
    "stats_dict",
    "util_json",
    "util_stats",
    "visualize",
]
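Not part of the diff: because mkinit re-exports these names at the subpackage level, downstream code (such as analysis.py above) can import the main entry points directly, for example:

from json_benchmarks.benchmarker import Benchmarker, ResultAnalysis, aggregate_stats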
@@ -0,0 +1,233 @@
import json
from dataclasses import dataclass

import numpy as np
import timerit
import ubelt as ub

from json_benchmarks.benchmarker.process_context import ProcessContext


@dataclass
class BenchmarkerConfig:
    name: str = None
    num: int = 100
    bestof: int = 10


class BenchmarkerResult:
    """
    Serialization for a single benchmark result
    """

    def __init__(self, context, rows):
        self.context = context
        self.rows = rows

    def __json__(self):
        data = {
            "type": "benchmark_result",
            "context": self.context,
            "rows": self.rows,
        }
        return data

    @classmethod
    def from_json(cls, data):
        assert data["type"] == "benchmark_result"
        self = cls(data["context"], data["rows"])
        return self

    @classmethod
    def load(cls, fpath):
        with open(fpath) as file:
            data = json.load(file)
        self = cls.from_json(data)
        return self

    def to_result_list(self):
        """
        Returns a list of result objects suitable for ResultAnalysis

        Returns:
            List[Result]
        """
        from json_benchmarks.benchmarker import result_analysis

        results = []
        for row in self.rows:
            result = result_analysis.Result(
                name=row["name"],
                metrics=row["metrics"],
                params=row["params"].copy(),
            )
            machine = self.context["machine"]
            assert not ub.dict_isect(result.params, machine)
            result.params.update(machine)
            results.append(result)
        return results


class Benchmarker:
    """
    Helper to organize the execution and serialization of a benchmark

    Example:
        >>> import numpy as np
        >>> impl_lut = {
        >>>     'numpy': np.sum,
        >>>     'builtin': sum,
        >>> }
        >>> def data_lut(params):
        >>>     item = 42 if params['dtype'] == 'int' else 42.0
        >>>     data = [item] * params['size']
        >>>     return data
        >>> basis = {
        >>>     'impl': ['builtin', 'numpy'],
        >>>     'size': [10, 10000],
        >>>     'dtype': ['int', 'float'],
        >>> }
        >>> self = Benchmarker(name='demo', num=10, bestof=3, basis=basis)
        >>> for params in self.iter_params():
        >>>     impl = impl_lut[params['impl']]
        >>>     data = data_lut(params)
        >>>     for timer in self.measure():
        >>>         with timer:
        >>>             impl(data)
        >>> print('self.result = {}'.format(ub.repr2(self.result.__json__(), sort=0, nl=2, precision=8)))
        >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir()
        >>> self.dump_in_dpath(dpath)
    """

    def __init__(self, basis={}, verbose=1, **kwargs):
        self.basis = basis

        self.config = BenchmarkerConfig(**kwargs)

        self.ti = timerit.Timerit(
            num=self.config.num,
            bestof=self.config.bestof,
            verbose=verbose,
        )
        self.context = ProcessContext(name=self.config.name)
        self.rows = []
        self.RECORD_ALL = 0
        self.result = None

    def dump_in_dpath(self, dpath):
        dpath = ub.Path(dpath)
        timestamp = self.context.obj["stop_timestamp"]
        fname = f"benchmarks_{self.config.name}_{timestamp}.json"
        fpath = dpath / fname

        with open(fpath, "w") as file:
            json.dump(self.result.__json__(), file)
        return fpath

    def iter_params(self):
        self.context.start()
        if isinstance(self.basis, dict):
            grid_iter = ub.named_product(self.basis)
        else:
            grid_iter = ub.flatten([ub.named_product(b) for b in self.basis])

        for params in grid_iter:
            self.params = params
            self.key = ub.repr2(params, compact=1, si=1)
            yield params
        obj = self.context.stop()
        self.result = BenchmarkerResult(obj, self.rows)

    def measure(self):
        yield from self.ti.reset(self.key)

        rows = self.rows
        ti = self.ti
        key = self.key
        params = self.params
        times = ti.robust_times()
        if self.RECORD_ALL:
            for time in times:
                metrics = {
                    "time": time,
                }
                row = {
                    "name": key,
                    "metrics": metrics,
                    "params": params,
                }
                rows.append(row)
        else:
            from json_benchmarks.benchmarker import util_stats

            times = np.array(times)
            metrics = util_stats.stats_dict(times, "_time")
            row = {
                "metrics": metrics,
                "params": params,
                "name": key,
            }
            rows.append(row)


def _test_demo():
    import numpy as np

    from json_benchmarks.benchmarker import BenchmarkerResult, result_analysis
    from json_benchmarks.benchmarker.benchmarker import Benchmarker

    impl_lut = {
        "numpy": np.sum,
        "builtin": sum,
    }

    def data_lut(params):
        item = 42 if params["dtype"] == "int" else 42.0
        data = [item] * params["size"]
        return data

    basis = {
        "impl": ["builtin", "numpy"],
        "size": [10, 10000],
        "dtype": ["int", "float"],
    }

    dpath = ub.Path.appdir("benchmarker/agg_demo").delete().ensuredir()

    def run_one_benchmark():
        self = Benchmarker(name="agg_demo", num=10, bestof=3, basis=basis)
        for params in self.iter_params():
            impl = impl_lut[params["impl"]]
            data = data_lut(params)
            for timer in self.measure():
                with timer:
                    impl(data)
        fpath = self.dump_in_dpath(dpath)
        return fpath

    # Run the benchmark multiple times
    fpaths = []
    for _ in range(5):
        fpath = run_one_benchmark()
        fpaths.append(fpath)

    results = []
    for fpath in fpaths:
        # Load each dumped file once and flatten its rows into Result objects
        result = BenchmarkerResult.load(fpath)
        results.extend(result.to_result_list())

    analysis = result_analysis.ResultAnalysis(
        results,
        metrics=["min", "mean"],
        params=["impl"],
        metric_objectives={
            "min": "min",
            "mean": "min",
        },
    )
    analysis.analysis()
    # single_df = pd.DataFrame(data['rows'])
    # context = data['context']
    # single_df
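Not part of the diff: an illustrative sketch of the payload that Benchmarker.dump_in_dpath / BenchmarkerResult.__json__ write to disk. All values below are made up, and the exact metric keys depend on util_stats.stats_dict; only the top-level "type", "context", and "rows" layout is what from_json checks.

example_payload = {
    "type": "benchmark_result",           # checked by BenchmarkerResult.from_json
    "context": {                           # produced by ProcessContext.stop()
        "type": "process_context",
        "name": "demo",
        "args": ["python", "-m", "json_benchmarks"],
        "config": None,
        "machine": {"host": "example-host", "cpu_brand": "Example CPU"},
        "start_timestamp": "2022-01-01T000000+0000",
        "stop_timestamp": "2022-01-01T000100+0000",
    },
    "rows": [                              # one row per parameter combination
        {
            "name": "dtype=int,impl=builtin,size=10",
            "metrics": {"mean_time": 1e-06, "min_time": 9e-07},
            "params": {"impl": "builtin", "size": 10, "dtype": "int"},
        },
    ],
}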
@@ -0,0 +1,123 @@
import platform
import socket
import sys

import ubelt as ub


class ProcessContext:
    """
    Context manager to track the context under which a result was computed

    Example:
        >>> from json_benchmarks.benchmarker.process_context import *  # NOQA
        >>> self = ProcessContext()
        >>> obj = self.start().stop()
        >>> print('obj = {}'.format(ub.repr2(obj, nl=2)))
    """

    def __init__(self, name=None, args=None, config=None):
        if args is None:
            args = sys.argv

        self.obj = {
            "type": "process_context",
            "name": name,
            "args": args,
            "config": config,
            "machine": None,
            "start_timestamp": None,
            "stop_timestamp": None,
        }

    def _timestamp(self):
        import datetime

        timestamp = (
            datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc).isoformat()
        )
        timestamp = timestamp.replace(":", "")
        # timestamp = ub.timestamp()
        return timestamp

    def _hostinfo(self):
        return {
            "host": socket.gethostname(),
            "user": ub.Path(ub.userhome()).name,
            # 'cwd': os.getcwd(),
        }

    def _osinfo(self):
        (
            uname_system,
            _,
            uname_release,
            uname_version,
            _,
            uname_processor,
        ) = platform.uname()
        return {
            "os_name": uname_system,
            "os_release": uname_release,
            "os_version": uname_version,
            "arch": uname_processor,
        }

    def _pyinfo(self):
        return {
            "py_impl": platform.python_implementation(),
            "py_version": sys.version.replace("\n", ""),
        }

    def _meminfo(self):
        import psutil

        # TODO: could collect memory info at start and stop and intermediate
        # stages. Here we just want info that is static wrt the machine.
        # For now, just get the total available.
        svmem_info = psutil.virtual_memory()
        return {
            "mem_total": svmem_info.total,
        }

    def _cpuinfo(self):
        import cpuinfo

        _cpu_info = cpuinfo.get_cpu_info()
        cpu_info = {
            "cpu_brand": _cpu_info["brand_raw"],
        }
        return cpu_info

    def _machine(self):
        return ub.dict_union(
            self._hostinfo(),
            self._meminfo(),
            self._cpuinfo(),
            self._osinfo(),
            self._pyinfo(),
        )

    def start(self):
        self.obj.update(
            {
                "machine": self._machine(),
                "start_timestamp": self._timestamp(),
                "stop_timestamp": None,
            }
        )
        return self

    def stop(self):
        self.obj.update(
            {
                "stop_timestamp": self._timestamp(),
            }
        )
        return self.obj

    def __enter__(self):
        return self.start()

    def __exit__(self, a, b, c):
        self.stop()
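Not part of the diff: ProcessContext can be driven imperatively (start/stop, as Benchmarker does above) or as a context manager. A small sketch, noting that psutil and py-cpuinfo must be installed because _meminfo and _cpuinfo import them lazily; the name below is an arbitrary label.

proc = ProcessContext(name="example")
with proc:
    pass  # run the code whose environment should be recorded
# __exit__ calls stop(), so the finished record is available on proc.obj
record = proc.obj
assert record["machine"] is not None
assert record["stop_timestamp"] is not None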
@ -0,0 +1,1089 @@
|
|||
import itertools as it
|
||||
import math
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import scipy
|
||||
import scipy.stats # NOQA
|
||||
import ubelt as ub
|
||||
|
||||
# a list of common objectives
|
||||
DEFAULT_METRIC_TO_OBJECTIVE = {
|
||||
"time": "min",
|
||||
"ap": "max",
|
||||
"acc": "max",
|
||||
"f1": "max",
|
||||
"mcc": "max",
|
||||
#
|
||||
"loss": "min",
|
||||
"brier": "min",
|
||||
}
|
||||
|
||||
|
||||
class Result(ub.NiceRepr):
|
||||
"""
|
||||
Storage of names, parameters, and quality metrics for a single experiment.
|
||||
|
||||
Attributes:
|
||||
name (str | None):
|
||||
Name of the experiment. Optional. This is unused in the analysis.
|
||||
(i.e. names will never be used computationally. Use them for keys)
|
||||
|
||||
params (Dict[str, object]): configuration of the experiment.
|
||||
This is a dictionary mapping a parameter name to its value.
|
||||
|
||||
metrics (Dict[str, float]): quantitative results of the experiment
|
||||
This is a dictionary for each quality metric computed on this
|
||||
result.
|
||||
|
||||
meta (Dict | None): any other metadata about this result.
|
||||
This is unused in the analysis.
|
||||
|
||||
Example:
|
||||
>>> self = Result.demo(rng=32)
|
||||
>>> print('self = {}'.format(self))
|
||||
self = <Result(name=53f57161,f1=0.33,acc=0.75,param1=1,param2=6.67,param3=a)>
|
||||
|
||||
Example:
|
||||
>>> self = Result.demo(mode='alt', rng=32)
|
||||
>>> print('self = {}'.format(self))
|
||||
"""
|
||||
|
||||
def __init__(self, name, params, metrics, meta=None):
|
||||
self.name = name
|
||||
self.params = params
|
||||
self.metrics = metrics
|
||||
self.meta = meta
|
||||
|
||||
def to_dict(self):
|
||||
row = ub.dict_union({"name": self.name}, self.metrics, self.params)
|
||||
return row
|
||||
|
||||
def __nice__(self):
|
||||
row = self.to_dict()
|
||||
text = ub.repr2(row, compact=True, precision=2, sort=0)
|
||||
return text
|
||||
|
||||
@classmethod
|
||||
def demo(cls, mode="null", rng=None):
|
||||
import string
|
||||
|
||||
import kwarray
|
||||
import numpy as np
|
||||
|
||||
rng = kwarray.ensure_rng(rng)
|
||||
|
||||
if mode == "null":
|
||||
# The null hypothesis should generally be true here,
|
||||
# there is no relation between the results and parameters
|
||||
demo_param_space = {
|
||||
"param1": list(range(3)),
|
||||
"param2": np.linspace(0, 10, 10),
|
||||
"param3": list(string.ascii_lowercase[0:3]),
|
||||
}
|
||||
params = {k: rng.choice(b) for k, b in demo_param_space.items()}
|
||||
metrics = {
|
||||
"f1": rng.rand(),
|
||||
"acc": rng.rand(),
|
||||
}
|
||||
elif mode == "alt":
|
||||
# The alternative hypothesis should be true here, there is a
|
||||
# relationship between results two of the params.
|
||||
from scipy.special import expit
|
||||
|
||||
params = {
|
||||
"u": rng.randint(0, 1 + 1),
|
||||
"v": rng.randint(-1, 1 + 1),
|
||||
"x": rng.randint(-2, 3 + 1),
|
||||
"y": rng.randint(-1, 2 + 1),
|
||||
"z": rng.randint(-0, 3 + 1),
|
||||
}
|
||||
noise = rng.randn() * 1  # use the seeded rng for reproducibility
|
||||
r = 3 * params["x"] + params["y"] ** 2 + 0.3 * params["z"] ** 3
|
||||
acc = expit(r / 20 + noise)
|
||||
metrics = {
|
||||
"acc": acc,
|
||||
}
|
||||
else:
|
||||
raise KeyError(mode)
|
||||
name = ub.hash_data(params)[0:8]
|
||||
self = cls(name, params, metrics)
|
||||
return self
|
||||
|
||||
|
||||
class ResultAnalysis(ub.NiceRepr):
|
||||
"""
|
||||
Groups and runs stats on results
|
||||
|
||||
Runs statistical tests on sets of configuration-metrics pairs
|
||||
|
||||
Attributes:
|
||||
results (List[Result]): list of results
|
||||
|
||||
ignore_metrics (Set[str]): metrics to ignore
|
||||
|
||||
ignore_params (Set[str]): parameters to ignore
|
||||
|
||||
metric_objectives (Dict[str, str]):
|
||||
indicate if each metric should be maximized "max" or minimized
|
||||
"min"
|
||||
|
||||
metrics (List[str]):
|
||||
only consider these metrics
|
||||
|
||||
params (List[str]):
|
||||
if given, only consider these params
|
||||
|
||||
abalation_orders (Set[int]):
|
||||
The number of parameters to be held constant in each statistical
|
||||
grouping. Defaults to 1, so it groups together results where 1
|
||||
variable is held constant. Including 2 will include pairwise
|
||||
settings of parameters to be held constant. Using -1 or -2 means
|
||||
all but 1 or 2 parameters will be held constant, respectively.
|
||||
|
||||
default_objective (str):
|
||||
assume max or min for unknown metrics
|
||||
|
||||
Example:
|
||||
>>> self = ResultAnalysis.demo()
|
||||
>>> self.analysis()
|
||||
|
||||
Example:
|
||||
>>> self = ResultAnalysis.demo(num=5000, mode='alt')
|
||||
>>> self.analysis()
|
||||
|
||||
Example:
|
||||
>>> # Given a list of experiments, configs, and results
|
||||
>>> # Create a ResultAnalysis object
|
||||
>>> results = ResultAnalysis([
|
||||
>>> Result('expt0', {'param1': 2, 'param3': 'b'}, {'f1': 0.75}),
|
||||
>>> Result('expt1', {'param1': 0, 'param3': 'c'}, {'f1': 0.92}),
|
||||
>>> Result('expt2', {'param1': 1, 'param3': 'b'}, {'f1': 0.77}),
|
||||
>>> Result('expt3', {'param1': 1, 'param3': 'a'}, {'f1': 0.67}),
|
||||
>>> Result('expt4', {'param1': 0, 'param3': 'c'}, {'f1': 0.98}),
|
||||
>>> Result('expt5', {'param1': 2, 'param3': 'a'}, {'f1': 0.86}),
|
||||
>>> Result('expt6', {'param1': 1, 'param3': 'c'}, {'f1': 0.77}),
|
||||
>>> Result('expt7', {'param1': 1, 'param3': 'c'}, {'f1': 0.41}),
|
||||
>>> Result('expt8', {'param1': 1, 'param3': 'a'}, {'f1': 0.64}),
|
||||
>>> Result('expt9', {'param1': 0, 'param3': 'b'}, {'f1': 0.95}),
|
||||
>>> ])
|
||||
>>> # Calling the analysis method prints something like the following
|
||||
>>> results.analysis()
|
||||
|
||||
PARAMETER 'param1' - f1
|
||||
=======================
|
||||
f1 mean std max min num best
|
||||
param1
|
||||
0 0.950 0.030000 0.98 0.92 3.0 0.98
|
||||
2 0.805 0.077782 0.86 0.75 2.0 0.86
|
||||
1 0.652 0.147377 0.77 0.41 5.0 0.77
|
||||
|
||||
ANOVA hypothesis (roughly): the param 'param1' has no effect on the metric
|
||||
Reject this hypothesis if the p value is less than a threshold
|
||||
Rank-ANOVA: p=0.0397
|
||||
Mean-ANOVA: p=0.0277
|
||||
|
||||
Pairwise T-Tests
|
||||
Is param1=0 about as good as param1=2?
|
||||
ttest_ind: p=0.2058
|
||||
Is param1=1 about as good as param1=2?
|
||||
ttest_ind: p=0.1508
|
||||
|
||||
|
||||
PARAMETER 'param3' - f1
|
||||
=======================
|
||||
f1 mean std max min num best
|
||||
param3
|
||||
c 0.770000 0.255734 0.98 0.41 4.0 0.98
|
||||
b 0.823333 0.110151 0.95 0.75 3.0 0.95
|
||||
a 0.723333 0.119304 0.86 0.64 3.0 0.86
|
||||
|
||||
ANOVA hypothesis (roughly): the param 'param3' has no effect on the metric
|
||||
Reject this hypothesis if the p value is less than a threshold
|
||||
Rank-ANOVA: p=0.5890
|
||||
Mean-ANOVA: p=0.8145
|
||||
|
||||
Pairwise T-Tests
|
||||
Is param3=b about as good as param3=c?
|
||||
ttest_ind: p=0.7266
|
||||
Is param3=a about as good as param3=b?
|
||||
ttest_ind: p=0.3466
|
||||
ttest_rel: p=0.3466
|
||||
Is param3=a about as good as param3=c?
|
||||
ttest_ind: p=0.7626
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
results,
|
||||
metrics=None,
|
||||
params=None,
|
||||
ignore_params=None,
|
||||
ignore_metrics=None,
|
||||
metric_objectives=None,
|
||||
abalation_orders={1},
|
||||
default_objective="max",
|
||||
p_threshold=0.05,
|
||||
):
|
||||
self.results = results
|
||||
if ignore_metrics is None:
|
||||
ignore_metrics = set()
|
||||
if ignore_params is None:
|
||||
ignore_params = set()
|
||||
self.ignore_params = ignore_params
|
||||
self.ignore_metrics = ignore_metrics
|
||||
|
||||
self.abalation_orders = abalation_orders
|
||||
self.default_objective = default_objective
|
||||
|
||||
# encode if we want to maximize or minimize a metric
|
||||
if metric_objectives is None:
|
||||
metric_objectives = {}
|
||||
self.metric_objectives = DEFAULT_METRIC_TO_OBJECTIVE.copy()
|
||||
self.metric_objectives.update(metric_objectives)
|
||||
|
||||
self.params = params
|
||||
self.metrics = metrics
|
||||
self.statistics = None
|
||||
self.p_threshold = p_threshold
|
||||
|
||||
self._description = {}
|
||||
self._description["built"] = False
|
||||
self._description["num_results"] = len(self.results)
|
||||
|
||||
def __nice__(self):
|
||||
return ub.repr2(self._description, si=1, sv=1)
|
||||
|
||||
@classmethod
|
||||
def demo(cls, num=10, mode="null", rng=None):
|
||||
import kwarray
|
||||
|
||||
rng = kwarray.ensure_rng(rng)
|
||||
results = [Result.demo(mode=mode, rng=rng) for _ in range(num)]
|
||||
if mode == "null":
|
||||
self = cls(results, metrics={"f1", "acc"})
|
||||
else:
|
||||
self = cls(results, metrics={"acc"})
|
||||
return self
|
||||
|
||||
def run(self):
|
||||
self.build()
|
||||
self.report()
|
||||
|
||||
def analysis(self):
|
||||
# alias for run
|
||||
return self.run()
|
||||
|
||||
@ub.memoize_property
|
||||
def table(self):
|
||||
rows = [r.to_dict() for r in self.results]
|
||||
table = pd.DataFrame(rows)
|
||||
return table
|
||||
|
||||
def metric_table(self):
|
||||
rows = [r.to_dict() for r in self.results]
|
||||
table = pd.DataFrame(rows)
|
||||
return table
|
||||
|
||||
@ub.memoize_property
|
||||
def varied(self):
|
||||
config_rows = [r.params for r in self.results]
|
||||
sentinel = object()
|
||||
# pd.DataFrame(config_rows).channels
|
||||
varied = dict(ub.varied_values(config_rows, default=sentinel, min_variations=1))
|
||||
# remove nans
|
||||
varied = {
|
||||
k: {v for v in vs if not (isinstance(v, float) and math.isnan(v))}
|
||||
for k, vs in varied.items()
|
||||
}
|
||||
varied = {k: vs for k, vs in varied.items() if len(vs)}
|
||||
return varied
|
||||
|
||||
def abalation_groups(self, param_group, k=2):
|
||||
"""
|
||||
Return groups where the specified parameter(s) are varied, but all
|
||||
other non-ignored parameters are held the same.
|
||||
|
||||
Args:
|
||||
param_group (str | List[str]):
|
||||
One or more parameters that are allowed to vary
|
||||
|
||||
k (int):
|
||||
minimum number of items a group must contain to be returned
|
||||
|
||||
Returns:
|
||||
List[DataFrame]:
|
||||
a list of subsets of the table in which all but the specified
(non-ignored) parameters are held constant.
|
||||
|
||||
Example:
|
||||
>>> self = ResultAnalysis.demo()
|
||||
>>> param = 'param2'
|
||||
>>> self.abalation_groups(param)
|
||||
"""
|
||||
if not ub.iterable(param_group):
|
||||
param_group = [param_group]
|
||||
table = self.table
|
||||
config_rows = [r.params for r in self.results]
|
||||
config_keys = list(map(set, config_rows))
|
||||
# if self.params:
|
||||
# config_keys = list(self.params)
|
||||
if self.ignore_params:
|
||||
config_keys = [c - self.ignore_params for c in config_keys]
|
||||
isect_params = set.intersection(*config_keys)
|
||||
other_params = sorted(isect_params - set(param_group))
|
||||
groups = []
|
||||
for key, group in table.groupby(other_params, dropna=False):
|
||||
if len(group) >= k:
|
||||
groups.append(group)
|
||||
return groups
|
||||
|
||||
def _objective_is_ascending(self, metric_key):
|
||||
"""
|
||||
Args:
|
||||
metric_key (str): the metric in question
|
||||
|
||||
Returns:
|
||||
bool:
|
||||
True if we should minimize the objective (lower is better)
|
||||
False if we should maximize the objective (higher is better)
|
||||
"""
|
||||
objective = self.metric_objectives.get(metric_key, None)
|
||||
if objective is None:
|
||||
warnings.warn(f"assuming {self.default_objective} for {metric_key=}")
|
||||
objective = self.default_objective
|
||||
ascending = objective == "min"
|
||||
return ascending
|
||||
|
||||
def abalate(self, param_group):
|
||||
"""
|
||||
TODO:
|
||||
rectify with test-group
|
||||
|
||||
Example:
|
||||
>>> self = ResultAnalysis.demo(100)
|
||||
>>> param = 'param2'
|
||||
>>> # xdoctest: +REQUIRES(module:openskill)
|
||||
>>> self.abalate(param)
|
||||
|
||||
>>> self = ResultAnalysis.demo()
|
||||
>>> param_group = ['param2', 'param3']
|
||||
>>> # xdoctest: +REQUIRES(module:openskill)
|
||||
>>> self.abalate(param_group)
|
||||
"""
|
||||
if self.table is None:
|
||||
self.table = self.build_table()
|
||||
if not ub.iterable(param_group):
|
||||
param_group = [param_group]
|
||||
|
||||
# For hashable generic dictionary
|
||||
from collections import namedtuple
|
||||
|
||||
gd = namedtuple("config", param_group)
|
||||
|
||||
# from types import SimpleNamespace
|
||||
param_unique_vals_ = (
|
||||
self.table[param_group].drop_duplicates().to_dict("records")
|
||||
)
|
||||
param_unique_vals = [gd(**d) for d in param_unique_vals_]
|
||||
# param_unique_vals = {p: self.table[p].unique().tolist() for p in param_group}
|
||||
score_improvements = ub.ddict(list)
|
||||
scored_obs = []
|
||||
skillboard = SkillTracker(param_unique_vals)
|
||||
groups = self.abalation_groups(param_group, k=2)
|
||||
|
||||
for group in groups:
|
||||
for metric_key in self.metrics:
|
||||
ascending = self._objective_is_ascending(metric_key)
|
||||
|
||||
group = group.sort_values(metric_key, ascending=ascending)
|
||||
subgroups = group.groupby(param_group)
|
||||
if ascending:
|
||||
best_idx = subgroups[metric_key].idxmax()
|
||||
else:
|
||||
best_idx = subgroups[metric_key].idxmin()
|
||||
best_group = group.loc[best_idx]
|
||||
best_group = best_group.sort_values(metric_key, ascending=ascending)
|
||||
|
||||
for x1, x2 in it.product(best_group.index, best_group.index):
|
||||
if x1 != x2:
|
||||
r1 = best_group.loc[x1]
|
||||
r2 = best_group.loc[x2]
|
||||
k1 = gd(**r1[param_group])
|
||||
k2 = gd(**r2[param_group])
|
||||
diff = r1[metric_key] - r2[metric_key]
|
||||
score_improvements[(k1, k2, metric_key)].append(diff)
|
||||
|
||||
# metric_vals = best_group[metric_key].values
|
||||
# diffs = metric_vals[None, :] - metric_vals[:, None]
|
||||
best_group.set_index(param_group)
|
||||
# best_group[param_group]
|
||||
# best_group[metric_key].diff()
|
||||
scored_ranking = best_group[param_group + [metric_key]].reset_index(
|
||||
drop=True
|
||||
)
|
||||
scored_obs.append(scored_ranking)
|
||||
ranking = [
|
||||
gd(**d) for d in scored_ranking[param_group].to_dict("records")
|
||||
]
|
||||
skillboard.observe(ranking)
|
||||
|
||||
print(
|
||||
"skillboard.ratings = {}".format(
|
||||
ub.repr2(skillboard.ratings, nl=1, align=":")
|
||||
)
|
||||
)
|
||||
win_probs = skillboard.predict_win()
|
||||
print(f"win_probs = {ub.repr2(win_probs, nl=1)}")
|
||||
for key, improves in score_improvements.items():
|
||||
k1, k2, metric_key = key
|
||||
improves = np.array(improves)
|
||||
pos_delta = improves[improves > 0]
|
||||
print(
|
||||
f"\nWhen {k1} is better than {k2}, the improvement in {metric_key} is"
|
||||
)
|
||||
print(pd.DataFrame([pd.Series(pos_delta).describe().T]))
|
||||
return scored_obs
|
||||
|
||||
def test_group(self, param_group, metric_key):
|
||||
"""
|
||||
Get stats for a particular metric / constant group
|
||||
|
||||
Args:
|
||||
param_group (List[str]): group of parameters to hold constant.
|
||||
metric_key (str): The metric to test.
|
||||
|
||||
Returns:
|
||||
dict
|
||||
# TODO : document these stats clearly and accurately
|
||||
|
||||
Example:
|
||||
>>> self = ResultAnalysis.demo(num=100)
|
||||
>>> print(self.table)
|
||||
>>> param_group = ['param2', 'param1']
|
||||
>>> metric_key = 'f1'
|
||||
>>> stats_row = self.test_group(param_group, metric_key)
|
||||
>>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, sort=0, precision=2)))
|
||||
"""
|
||||
param_group_name = ",".join(param_group)
|
||||
stats_row = {
|
||||
"param_name": param_group_name,
|
||||
"metric": metric_key,
|
||||
}
|
||||
# param_values = varied[param_name]
|
||||
# stats_row['param_values'] = param_values
|
||||
ascending = self._objective_is_ascending(metric_key)
|
||||
|
||||
# Find all items with this particular param value
|
||||
value_to_metric_group = {}
|
||||
value_to_metric_stats = {}
|
||||
value_to_metric = {}
|
||||
|
||||
varied_cols = sorted(self.varied.keys())
|
||||
|
||||
# Not sure if this is the right name, these are the other param keys
|
||||
# that we are not directly investigating, but might have an impact.
|
||||
# We use these to select comparable rows for pairwise t-tests
|
||||
nuisance_cols = sorted(set(self.varied.keys()) - set(param_group))
|
||||
|
||||
for param_value, group in self.table.groupby(param_group):
|
||||
metric_group = group[["name", metric_key] + varied_cols]
|
||||
metric_vals = metric_group[metric_key]
|
||||
metric_vals = metric_vals.dropna()
|
||||
if len(metric_vals) > 0:
|
||||
metric_stats = metric_vals.describe()
|
||||
value_to_metric_stats[param_value] = metric_stats
|
||||
value_to_metric_group[param_value] = metric_group
|
||||
value_to_metric[param_value] = metric_vals.values
|
||||
|
||||
moments = pd.DataFrame(value_to_metric_stats).T
|
||||
moments = moments.sort_values("mean", ascending=ascending)
|
||||
moments.index.name = param_group_name
|
||||
moments.columns.name = metric_key
|
||||
ranking = moments["mean"].index.to_list()
|
||||
param_to_rank = ub.invert_dict(dict(enumerate(ranking)))
|
||||
|
||||
# Determine a set of value pairs to do pairwise comparisons on
|
||||
value_pairs = ub.oset()
|
||||
# value_pairs.update(
|
||||
# map(frozenset, ub.iter_window(moments.index, 2)))
|
||||
value_pairs.update(
|
||||
map(
|
||||
frozenset,
|
||||
ub.iter_window(
|
||||
moments.sort_values("mean", ascending=ascending).index, 2
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# https://en.wikipedia.org/wiki/Kruskal%E2%80%93Wallis_one-way_analysis_of_variance
|
||||
# If the researcher can make the assumptions of an identically
|
||||
# shaped and scaled distribution for all groups, except for any
|
||||
# difference in medians, then the null hypothesis is that the
|
||||
# medians of all groups are equal, and the alternative
|
||||
# hypothesis is that at least one population median of one
|
||||
# group is different from the population median of at least one
|
||||
# other group.
|
||||
try:
|
||||
anova_krus_result = scipy.stats.kruskal(*value_to_metric.values())
|
||||
except ValueError:
|
||||
anova_krus_result = scipy.stats.stats.KruskalResult(np.nan, np.nan)
|
||||
|
||||
# https://en.wikipedia.org/wiki/One-way_analysis_of_variance
|
||||
# The One-Way ANOVA tests the null hypothesis, which states
|
||||
# that samples in all groups are drawn from populations with
|
||||
# the same mean values
|
||||
if len(value_to_metric) > 1:
|
||||
anova_1way_result = scipy.stats.f_oneway(*value_to_metric.values())
|
||||
else:
|
||||
anova_1way_result = scipy.stats.stats.F_onewayResult(np.nan, np.nan)
|
||||
|
||||
stats_row["anova_rank_H"] = anova_krus_result.statistic
|
||||
stats_row["anova_rank_p"] = anova_krus_result.pvalue
|
||||
stats_row["anova_mean_F"] = anova_1way_result.statistic
|
||||
stats_row["anova_mean_p"] = anova_1way_result.pvalue
|
||||
stats_row["moments"] = moments
|
||||
|
||||
pair_stats_list = []
|
||||
for pair in value_pairs:
|
||||
pair_stats = {}
|
||||
param_val1, param_val2 = pair
|
||||
|
||||
metric_vals1 = value_to_metric[param_val1]
|
||||
metric_vals2 = value_to_metric[param_val2]
|
||||
|
||||
rank1 = param_to_rank[param_val1]
|
||||
rank2 = param_to_rank[param_val2]
|
||||
pair_stats["winner"] = param_val1 if rank1 < rank2 else param_val2
|
||||
pair_stats["value1"] = param_val1
|
||||
pair_stats["value2"] = param_val2
|
||||
pair_stats["n1"] = len(metric_vals1)
|
||||
pair_stats["n2"] = len(metric_vals2)
|
||||
|
||||
TEST_ONLY_FOR_DIFFERENCE = True
|
||||
if TEST_ONLY_FOR_DIFFERENCE:
|
||||
if ascending:
|
||||
# We want to minimize the metric
|
||||
alternative = "less" if rank1 < rank2 else "greater"
|
||||
else:
|
||||
# We want to maximize the metric
|
||||
alternative = "greater" if rank1 < rank2 else "less"
|
||||
else:
|
||||
alternative = "two-sided"
|
||||
|
||||
ind_kw = dict(
|
||||
equal_var=False,
|
||||
alternative=alternative,
|
||||
)
|
||||
ttest_ind_result = scipy.stats.ttest_ind(
|
||||
metric_vals1, metric_vals2, **ind_kw
|
||||
)
|
||||
|
||||
if 0:
|
||||
from benchmarker.benchmarker import stats_dict
|
||||
|
||||
stats1 = stats_dict(metric_vals1)
|
||||
stats2 = stats_dict(metric_vals2)
|
||||
scipy.stats.ttest_ind_from_stats(
|
||||
stats1["mean"],
|
||||
stats1["std"],
|
||||
stats1["nobs"],
|
||||
stats2["mean"],
|
||||
stats2["std"],
|
||||
stats2["nobs"],
|
||||
**ind_kw,
|
||||
)
|
||||
# metric_vals1, metric_vals2, equal_var=False)
|
||||
|
||||
scipy.stats.ttest_ind_from_stats
|
||||
|
||||
pair_stats["ttest_ind"] = ttest_ind_result
|
||||
|
||||
# Do relative checks, need to find comparable subgroups
|
||||
metric_group1 = value_to_metric_group[param_val1]
|
||||
metric_group2 = value_to_metric_group[param_val2]
|
||||
nuisance_vals1 = metric_group1[nuisance_cols]
|
||||
nuisance_vals2 = metric_group2[nuisance_cols]
|
||||
nk_to_group1 = dict(list(nuisance_vals1.groupby(nuisance_cols)))
|
||||
nk_to_group2 = dict(list(nuisance_vals2.groupby(nuisance_cols)))
|
||||
common = set(nk_to_group1) & set(nk_to_group2)
|
||||
comparable_indexes1 = []
|
||||
comparable_indexes2 = []
|
||||
if common:
|
||||
for nk in common:
|
||||
group1 = nk_to_group1[nk]
|
||||
group2 = nk_to_group2[nk]
|
||||
# TODO: Not sure if taking the product of everything within
|
||||
# the comparable group is correct or not. I think it is ok.
|
||||
for i, j in it.product(group1.index, group2.index):
|
||||
comparable_indexes1.append(i)
|
||||
comparable_indexes2.append(j)
|
||||
|
||||
comparable_groups1 = metric_group1.loc[comparable_indexes1, metric_key]
|
||||
comparable_groups2 = metric_group2.loc[comparable_indexes2, metric_key]
|
||||
|
||||
# Does this need to have the values aligned?
|
||||
# I think that is the case giving my understanding of paired
|
||||
# t-tests, but the docs need a PR to make that more clear.
|
||||
ttest_rel_result = scipy.stats.ttest_rel(
|
||||
comparable_groups1, comparable_groups2
|
||||
)
|
||||
pair_stats["n_common"] = len(common)
|
||||
pair_stats["ttest_rel"] = ttest_rel_result
|
||||
pair_stats_list.append(pair_stats)
|
||||
|
||||
stats_row["pairwise"] = pair_stats_list
|
||||
return stats_row
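# Not part of the diff: a tiny worked example of the two hypothesis tests used
# in test_group above, on made-up metric samples. A low p-value suggests the
# parameter setting affects the metric; the numbers below are illustrative.
import scipy.stats
metric_under_setting_a = [0.31, 0.30, 0.33, 0.29]
metric_under_setting_b = [0.40, 0.42, 0.39, 0.41]
print(scipy.stats.kruskal(metric_under_setting_a, metric_under_setting_b).pvalue)   # "Rank-ANOVA"
print(scipy.stats.f_oneway(metric_under_setting_a, metric_under_setting_b).pvalue)  # "Mean-ANOVA"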
|
||||
|
||||
def build(self):
|
||||
import itertools as it
|
||||
|
||||
if len(self.results) < 2:
|
||||
raise Exception("need at least 2 results")
|
||||
|
||||
varied = self.varied.copy()
|
||||
if self.ignore_params:
|
||||
for k in self.ignore_params:
|
||||
varied.pop(k, None)
|
||||
if self.params:
|
||||
varied = ub.dict_isect(varied, self.params)
|
||||
|
||||
# Experimental:
|
||||
# Find Auto-abalation groups
|
||||
# TODO: when the group size is -1, instead of showing all of the group
|
||||
# settings, for each group setting do the k=1 analysis within that group
|
||||
varied_param_names = list(varied.keys())
|
||||
num_varied_params = len(varied)
|
||||
held_constant_orders = {
|
||||
num_varied_params + i if i < 0 else i for i in self.abalation_orders
|
||||
}
|
||||
held_constant_orders = [i for i in held_constant_orders if i > 0]
|
||||
held_constant_groups = []
|
||||
for k in held_constant_orders:
|
||||
held_constant_groups.extend(
|
||||
list(map(list, it.combinations(varied_param_names, k)))
|
||||
)
|
||||
|
||||
if self.metrics is None:
|
||||
avail_metrics = set.intersection(
|
||||
*[set(r.metrics.keys()) for r in self.results]
|
||||
)
|
||||
metrics_of_interest = sorted(avail_metrics - set(self.ignore_metrics))
|
||||
else:
|
||||
metrics_of_interest = self.metrics
|
||||
self.metrics_of_interest = metrics_of_interest
|
||||
self._description["metrics_of_interest"] = metrics_of_interest
|
||||
self._description["num_groups"] = len(held_constant_groups)
|
||||
|
||||
# Analyze the impact of each parameter
|
||||
self.statistics = statistics = []
|
||||
for param_group in held_constant_groups:
|
||||
for metric_key in metrics_of_interest:
|
||||
stats_row = self.test_group(param_group, metric_key)
|
||||
statistics.append(stats_row)
|
||||
|
||||
self.stats_table = pd.DataFrame(
|
||||
[
|
||||
ub.dict_diff(d, {"pairwise", "param_values", "moments"})
|
||||
for d in self.statistics
|
||||
]
|
||||
)
|
||||
|
||||
if len(self.stats_table):
|
||||
self.stats_table = self.stats_table.sort_values("anova_rank_p")
|
||||
|
||||
self._description["built"] = True
|
||||
|
||||
def report(self):
|
||||
stat_groups = ub.group_items(self.statistics, key=lambda x: x["param_name"])
|
||||
stat_groups_items = list(stat_groups.items())
|
||||
|
||||
# Modify this order to change the grouping pattern
|
||||
grid = ub.named_product(
|
||||
{
|
||||
"stat_group_item": stat_groups_items,
|
||||
"metrics": self.metrics_of_interest,
|
||||
}
|
||||
)
|
||||
for grid_item in grid:
|
||||
self._report_one(grid_item)
|
||||
|
||||
print(self.stats_table)
|
||||
|
||||
def _report_one(self, grid_item):
|
||||
p_threshold = self.p_threshold
|
||||
metric_key = grid_item["metrics"]
|
||||
stat_groups_item = grid_item["stat_group_item"]
|
||||
|
||||
param_name, stat_group = stat_groups_item
|
||||
stats_row = ub.group_items(stat_group, key=lambda x: x["metric"])[metric_key][0]
|
||||
title = f"PARAMETER: {param_name} - METRIC: {metric_key}"
|
||||
print("\n\n")
|
||||
print(title)
|
||||
print("=" * len(title))
|
||||
print(stats_row["moments"])
|
||||
anova_rank_p = stats_row["anova_rank_p"]
|
||||
anova_mean_p = stats_row["anova_mean_p"]
|
||||
# Roughly speaking
|
||||
print("")
|
||||
print(f"ANOVA: If p is low, the param {param_name!r} might have an effect")
|
||||
print(
|
||||
ub.color_text(
|
||||
f" Rank-ANOVA: p={anova_rank_p:0.8f}",
|
||||
"green" if anova_rank_p < p_threshold else None,
|
||||
)
|
||||
)
|
||||
print(
|
||||
ub.color_text(
|
||||
f" Mean-ANOVA: p={anova_mean_p:0.8f}",
|
||||
"green" if anova_mean_p < p_threshold else None,
|
||||
)
|
||||
)
|
||||
print("")
|
||||
print("Pairwise T-Tests")
|
||||
for pairstat in stats_row["pairwise"]:
|
||||
# Is this backwards?
|
||||
value1 = pairstat["value1"]
|
||||
value2 = pairstat["value2"]
|
||||
winner = pairstat["winner"]
|
||||
if value2 == winner:
|
||||
value1, value2 = value2, value1
|
||||
print(
|
||||
f" If p is low, {param_name}={value1} may outperform {param_name}={value2}."
|
||||
)
|
||||
if "ttest_ind" in pairstat:
|
||||
ttest_ind_result = pairstat["ttest_ind"]
|
||||
print(
|
||||
ub.color_text(
|
||||
f" ttest_ind: p={ttest_ind_result.pvalue:0.8f}",
|
||||
"green" if ttest_ind_result.pvalue < p_threshold else None,
|
||||
)
|
||||
)
|
||||
if "ttest_rel" in pairstat:
|
||||
n_common = pairstat["n_common"]
|
||||
ttest_rel_result = pairstat["ttest_rel"]
|
||||
print(
|
||||
ub.color_text(
|
||||
f" ttest_rel: p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}",
|
||||
"green" if ttest_rel_result.pvalue < p_threshold else None,
|
||||
)
|
||||
)
|
||||
|
||||
def conclusions(self):
|
||||
conclusions = []
|
||||
for stat in self.statistics:
|
||||
param_name = stat["param_name"]
|
||||
metric = stat["metric"]
|
||||
for pairstat in stat["pairwise"]:
|
||||
value1 = pairstat["value1"]
|
||||
value2 = pairstat["value2"]
|
||||
winner = pairstat["winner"]
|
||||
if value2 == winner:
|
||||
value1, value2 = value2, value1
|
||||
pvalue = pairstat["ttest_ind"].pvalue
|
||||
txt = f"p={pvalue:0.8f}, If p is low, {param_name}={value1} may outperform {value2} on {metric}."
|
||||
conclusions.append(txt)
|
||||
return conclusions
|
||||
|
||||
def plot(self, xlabel, metric_key, group_labels, data=None, **kwargs):
|
||||
"""
|
||||
Args:
|
||||
group_labels (dict):
|
||||
Tells seaborn what attributes to use to distinguish curves like
|
||||
hue, size, marker. Also can contain "col" for use with
|
||||
FacetGrid, and "fig" to separate different configurations into
|
||||
different figures.
|
||||
|
||||
Returns:
|
||||
List[Dict]:
|
||||
A list for each figure containing info about that figure for any
|
||||
postprocessing.
|
||||
|
||||
Example:
|
||||
>>> self = ResultAnalysis.demo(num=1000, mode='alt')
|
||||
>>> self.analysis()
|
||||
>>> print('self = {}'.format(self))
|
||||
>>> print('self.varied = {}'.format(ub.repr2(self.varied, nl=1)))
|
||||
>>> # xdoctest: +REQUIRES(module:kwplot)
|
||||
>>> import kwplot
|
||||
>>> kwplot.autosns()
|
||||
>>> xlabel = 'x'
|
||||
>>> metric_key = 'acc'
|
||||
>>> group_labels = {
|
||||
>>> 'fig': ['u'],
|
||||
>>> 'col': ['y', 'v'],
|
||||
>>> 'hue': ['z'],
|
||||
>>> 'size': [],
|
||||
>>> }
|
||||
>>> kwargs = {'xscale': 'log', 'yscale': 'log'}
|
||||
>>> self.plot(xlabel, metric_key, group_labels, **kwargs)
|
||||
"""
|
||||
print("Init seaborn and pyplot")
|
||||
import seaborn as sns
|
||||
|
||||
sns.set()
|
||||
from matplotlib import pyplot as plt # NOQA
|
||||
|
||||
print("Starting plot")
|
||||
|
||||
if data is None:
|
||||
data = self.table
|
||||
data = data.sort_values(metric_key)
|
||||
|
||||
print("Compute group labels")
|
||||
for gname, labels in group_labels.items():
|
||||
if len(labels):
|
||||
new_col = []
|
||||
for row in data[labels].to_dict("records"):
|
||||
item = ub.repr2(row, compact=1, si=1)
|
||||
new_col.append(item)
|
||||
gkey = gname + "_key"
|
||||
data[gkey] = new_col
|
||||
|
||||
plot_kws = {
|
||||
"x": xlabel,
|
||||
"y": metric_key,
|
||||
}
|
||||
for gname, labels in group_labels.items():
|
||||
if labels:
|
||||
plot_kws[gname] = gname + "_key"
|
||||
|
||||
# Your variables may change
|
||||
# ax = plt.figure().gca()
|
||||
fig_params = plot_kws.pop("fig", [])
|
||||
|
||||
facet_kws = {
|
||||
"sharex": True,
|
||||
"sharey": True,
|
||||
}
|
||||
# facet_kws['col'] = plot_kws.pop("col", None)
|
||||
# facet_kws['row'] = plot_kws.pop("row", None)
|
||||
# if not facet_kws['row']:
|
||||
# facet_kws['col_wrap'] = 5
|
||||
plot_kws["row"] = plot_kws.get("row", None)
|
||||
# if not plot_kws['row']:
|
||||
# plot_kws['col_wrap'] = 5
|
||||
|
||||
if not fig_params:
|
||||
groups = [("", data)]
|
||||
else:
|
||||
groups = data.groupby(fig_params)
|
||||
|
||||
if "marker" not in plot_kws:
|
||||
plot_kws["marker"] = "o"
|
||||
|
||||
# We will want to overwrite this with our own std estimate
|
||||
plot_kws["ci"] = "sd"
|
||||
# err_style='band',
|
||||
# err_kws=None,
|
||||
|
||||
# Use a consistent palette across plots
|
||||
unique_hues = data["hue_key"].unique()
|
||||
palette = ub.dzip(unique_hues, sns.color_palette(n_colors=len(unique_hues)))
|
||||
plot_kws["palette"] = palette
|
||||
|
||||
# kwplot.close_figures()
|
||||
|
||||
plots = []
|
||||
base_fnum = 1
|
||||
print("Start plots")
|
||||
# hack
|
||||
hack_groups = [(k, v) for k, v in groups if k != "input=Complex object"]
|
||||
|
||||
for fnum, (fig_key, group) in enumerate(hack_groups, start=base_fnum):
|
||||
# TODO: seaborn doesn't give us any option to reuse an existing
|
||||
# figure or even specify what its handle should be. A patch should
|
||||
# be submitted to add that feature, but in the meantime work around
|
||||
# it and use the figures they give us.
|
||||
|
||||
# fig = plt.figure(fnum)
|
||||
# fig.clf()
|
||||
|
||||
facet = sns.relplot(
|
||||
data=group,
|
||||
kind="line",
|
||||
# kind="scatter",
|
||||
facet_kws=facet_kws,
|
||||
**plot_kws,
|
||||
)
|
||||
from json_benchmarks.benchmarker.util_stats import aggregate_stats
|
||||
|
||||
# print(f'facet._col_var={facet._col_var}')
|
||||
if facet._col_var is not None:
|
||||
facet_data_groups = dict(list(facet.data.groupby(facet._col_var)))
|
||||
else:
|
||||
facet_data_groups = None
|
||||
# facet_data_group_iter = iter(facet_data_groups.keys())
|
||||
|
||||
for ax in facet.axes.ravel():
|
||||
col_key = ax.get_title().split("=", 1)[-1].strip()
|
||||
# col_key = next(facet_data_group_iter)
|
||||
if facet_data_groups is not None:
|
||||
col_data = facet_data_groups[col_key]
|
||||
else:
|
||||
col_data = facet.data
|
||||
col_data["mean_time"]
|
||||
col_data["std_time"]
|
||||
xlabel = plot_kws["x"]
|
||||
ylabel = plot_kws["y"]
|
||||
subgroups = col_data.groupby(plot_kws["hue"])
|
||||
for subgroup_key, subgroup in subgroups:
|
||||
# combine stds in multiple groups on the x and manually draw errors
|
||||
suffix = "_" + ylabel.partition("_")[2]
|
||||
if "mean_" in ylabel:
|
||||
std_label = ylabel.replace("mean_", "std_")
|
||||
combo_group = aggregate_stats(
|
||||
subgroup, suffix=suffix, group_keys=[plot_kws["x"]]
|
||||
)
|
||||
_xdata = combo_group[xlabel].values
|
||||
_ydata_mean = combo_group[ylabel].values
|
||||
_ydata_std = combo_group[std_label].values
|
||||
std_label = ylabel.replace("mean_", "std_")
|
||||
|
||||
# Plot bars 3 standard deviations from the mean to
|
||||
# get a 99.7% interval
|
||||
num_std = 3
|
||||
y_data_min = _ydata_mean - num_std * _ydata_std
|
||||
y_data_max = _ydata_mean + num_std * _ydata_std
|
||||
spread_alpha = 0.3
|
||||
color = palette[subgroup_key]
|
||||
ax.fill_between(
|
||||
_xdata,
|
||||
y_data_min,
|
||||
y_data_max,
|
||||
alpha=spread_alpha,
|
||||
color=color,
|
||||
zorder=1,
|
||||
)
|
||||
# zorder=0)
|
||||
|
||||
xscale = kwargs.get("xscale", None)
|
||||
yscale = kwargs.get("yscale", None)
|
||||
for ax in facet.axes.ravel():
|
||||
if xscale is not None:
|
||||
try:
|
||||
ax.set_xscale(xscale)
|
||||
except ValueError:
|
||||
pass
|
||||
if yscale is not None:
|
||||
try:
|
||||
ax.set_yscale(yscale)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
fig = facet.figure
|
||||
fig.suptitle(fig_key)
|
||||
fig.tight_layout()
|
||||
# facet = sns.FacetGrid(group, **facet_kws)
|
||||
# facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, **plot_kws)
|
||||
# facet.add_legend()
|
||||
|
||||
plot = {
|
||||
"fig": fig,
|
||||
"facet": facet,
|
||||
}
|
||||
plots.append(plot)
|
||||
|
||||
# if fnum >= 1:
|
||||
# break
|
||||
|
||||
# print("Adjust plots")
|
||||
# for plot in plots:
|
||||
# xscale = kwargs.get("xscale", None)
|
||||
# yscale = kwargs.get("yscale", None)
|
||||
# facet = plot["facet"]
|
||||
|
||||
# facet_data_groups = dict(list(facet.data.groupby(facet._col_var)))
|
||||
# facet_data_group_iter = iter(facet_data_groups.keys())
|
||||
|
||||
# for ax in facet.axes.ravel():
|
||||
|
||||
# if xscale is not None:
|
||||
# try:
|
||||
# ax.set_xscale(xscale)
|
||||
# except ValueError:
|
||||
# pass
|
||||
# if yscale is not None:
|
||||
# try:
|
||||
# ax.set_yscale(yscale)
|
||||
# except ValueError:
|
||||
# pass
|
||||
print("Finish")
|
||||
return plots
|
||||
|
||||
|
||||
class SkillTracker:
|
||||
"""
|
||||
Wrapper around openskill
|
||||
|
||||
Args:
|
||||
player_ids (List[T]):
|
||||
a list of ids (usually ints) used to represent each player
|
||||
|
||||
Example:
|
||||
>>> # xdoctest: +REQUIRES(module:openskill)
|
||||
>>> self = SkillTracker([1, 2, 3, 4, 5])
|
||||
>>> self.observe([2, 3]) # Player 2 beat player 3.
|
||||
>>> self.observe([1, 2, 5, 3]) # Player 3 didnt play this round.
|
||||
>>> self.observe([2, 3, 4, 5, 1]) # Everyone played, player 2 won.
|
||||
>>> win_probs = self.predict_win()
|
||||
>>> print('win_probs = {}'.format(ub.repr2(win_probs, nl=1, precision=2)))
|
||||
win_probs = {
|
||||
1: 0.20,
|
||||
2: 0.21,
|
||||
3: 0.19,
|
||||
4: 0.20,
|
||||
5: 0.20,
|
||||
}
|
||||
|
||||
Requirements:
|
||||
openskill
|
||||
"""
|
||||
|
||||
def __init__(self, player_ids):
|
||||
import openskill
|
||||
|
||||
self.player_ids = player_ids
|
||||
self.ratings = {m: openskill.Rating() for m in player_ids}
|
||||
# self.observations = []
|
||||
|
||||
def predict_win(self):
|
||||
"""
|
||||
Estimate the probability that a particular player will win given the
|
||||
current ratings.
|
||||
|
||||
Returns:
|
||||
Dict[T, float]: mapping from player ids to win probabilities
|
||||
"""
|
||||
from openskill import predict_win
|
||||
|
||||
teams = [[p] for p in list(self.ratings.keys())]
|
||||
ratings = [[r] for r in self.ratings.values()]
|
||||
probs = predict_win(ratings)
|
||||
win_probs = {team[0]: prob for team, prob in zip(teams, probs)}
|
||||
return win_probs
|
||||
|
||||
def observe(self, ranking):
|
||||
"""
|
||||
After simulating a round, pass the ranked order of who won
|
||||
(winner is first, loser is last) to this function, and it
|
||||
updates the rankings.
|
||||
|
||||
Args:
|
||||
ranking (List[T]):
|
||||
ranking of all the players that played in this round
|
||||
winners are at the front (0-th place) of the list.
|
||||
"""
|
||||
import openskill
|
||||
|
||||
# self.observations.append(ranking)
|
||||
ratings = self.ratings
|
||||
team_standings = [[r] for r in ub.take(ratings, ranking)]
|
||||
# new_values = openskill.rate(team_standings) # Not inplace
|
||||
# new_ratings = [openskill.Rating(*new[0]) for new in new_values]
|
||||
new_team_ratings = openskill.rate(team_standings)
|
||||
new_ratings = [new[0] for new in new_team_ratings]
|
||||
ratings.update(ub.dzip(ranking, new_ratings))
|
|
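Not part of the diff: a short sketch of the full ResultAnalysis flow on the demo data. It only restates what the class above already does: analysis() is build() followed by report(), and conclusions() summarizes the pairwise t-tests as strings. ResultAnalysis.demo needs kwarray, and the demo results are random, so printed p-values vary.

self = ResultAnalysis.demo(num=100, rng=0)
self.build()    # computes self.statistics, self.stats_table, and self.varied
self.report()   # prints the per-parameter ANOVA and pairwise t-test summaries
for sentence in self.conclusions():
    print(sentence)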
@ -0,0 +1,240 @@
|
|||
import copy
|
||||
import json
|
||||
import pathlib
|
||||
from collections import OrderedDict
|
||||
|
||||
import numpy as np
|
||||
import ubelt as ub
|
||||
|
||||
|
||||
def ensure_json_serializable(dict_, normalize_containers=False, verbose=0):
|
||||
"""
|
||||
Attempt to convert common types (e.g. numpy) into something json complient
|
||||
|
||||
Convert numpy and tuples into lists
|
||||
|
||||
Args:
|
||||
normalize_containers (bool, default=False):
|
||||
if True, normalizes dict containers to be standard python
|
||||
structures.
|
||||
|
||||
Example:
|
||||
>>> data = ub.ddict(lambda: int)
|
||||
>>> data['foo'] = ub.ddict(lambda: int)
|
||||
>>> data['bar'] = np.array([1, 2, 3])
|
||||
>>> data['foo']['a'] = 1
|
||||
>>> data['foo']['b'] = (1, np.array([1, 2, 3]), {3: np.int32(3), 4: np.float16(1.0)})
|
||||
>>> dict_ = data
|
||||
>>> print(ub.repr2(data, nl=-1))
|
||||
>>> assert list(find_json_unserializable(data))
|
||||
>>> result = ensure_json_serializable(data, normalize_containers=True)
|
||||
>>> print(ub.repr2(result, nl=-1))
|
||||
>>> assert not list(find_json_unserializable(result))
|
||||
>>> assert type(result) is dict
|
||||
"""
|
||||
dict_ = copy.deepcopy(dict_)
|
||||
|
||||
def _norm_container(c):
|
||||
if isinstance(c, dict):
|
||||
# Cast to a normal dictionary
|
||||
if isinstance(c, OrderedDict):
|
||||
if type(c) is not OrderedDict:
|
||||
c = OrderedDict(c)
|
||||
else:
|
||||
if type(c) is not dict:
|
||||
c = dict(c)
|
||||
return c
|
||||
|
||||
walker = ub.IndexableWalker(dict_)
|
||||
for prefix, value in walker:
|
||||
if isinstance(value, tuple):
|
||||
new_value = list(value)
|
||||
walker[prefix] = new_value
|
||||
elif isinstance(value, np.ndarray):
|
||||
new_value = value.tolist()
|
||||
walker[prefix] = new_value
|
||||
elif isinstance(value, (np.integer)):
|
||||
new_value = int(value)
|
||||
walker[prefix] = new_value
|
||||
elif isinstance(value, (np.floating)):
|
||||
new_value = float(value)
|
||||
walker[prefix] = new_value
|
||||
elif isinstance(value, (np.complexfloating)):
|
||||
new_value = complex(value)
|
||||
walker[prefix] = new_value
|
||||
elif isinstance(value, pathlib.Path):
|
||||
new_value = str(value)
|
||||
walker[prefix] = new_value
|
||||
elif hasattr(value, "__json__"):
|
||||
new_value = value.__json__()
|
||||
walker[prefix] = new_value
|
||||
elif normalize_containers:
|
||||
if isinstance(value, dict):
|
||||
new_value = _norm_container(value)
|
||||
walker[prefix] = new_value
|
||||
|
||||
if normalize_containers:
|
||||
# normalize the outer layer
|
||||
dict_ = _norm_container(dict_)
|
||||
return dict_
|
||||
|
||||
|
||||
def find_json_unserializable(data, quickcheck=False):
|
||||
"""
|
||||
Recurse through json datastructure and find any component that
|
||||
causes a serialization error. Record the location of these errors
|
||||
in the datastructure as we recurse through the call tree.
|
||||
|
||||
Args:
|
||||
data (object): data that should be json serializable
|
||||
quickcheck (bool): if True, check the entire datastructure assuming
|
||||
its ok before doing the python-based recursive logic.
|
||||
|
||||
Returns:
|
||||
List[Dict]: list of "bad part" dictionaries containing items
|
||||
'value' - the value that caused the serialization error
|
||||
'loc' - which contains a list of key/indexes that can be used
|
||||
to lookup the location of the unserializable value.
|
||||
If the "loc" is a list, then it indicates a rare case where
|
||||
a key in a dictionary is causing the serialization error.
|
||||
|
||||
Example:
|
||||
>>> part = ub.ddict(lambda: int)
|
||||
>>> part['foo'] = ub.ddict(lambda: int)
|
||||
>>> part['bar'] = np.array([1, 2, 3])
|
||||
>>> part['foo']['a'] = 1
|
||||
>>> # Create a dictionary with two unserializable parts
|
||||
>>> data = [1, 2, {'nest1': [2, part]}, {frozenset({'badkey'}): 3, 2: 4}]
|
||||
>>> parts = list(find_json_unserializable(data))
|
||||
>>> print('parts = {}'.format(ub.repr2(parts, nl=1)))
|
||||
>>> # Check expected structure of bad parts
|
||||
>>> assert len(parts) == 2
|
||||
>>> part = parts[1]
|
||||
>>> assert list(part['loc']) == [2, 'nest1', 1, 'bar']
|
||||
>>> # We can use the "loc" to find the bad value
|
||||
>>> for part in parts:
|
||||
>>> # "loc" is a list of directions containing which keys/indexes
|
||||
>>> # to traverse at each descent into the data structure.
|
||||
>>> directions = part['loc']
|
||||
>>> curr = data
|
||||
>>> special_flag = False
|
||||
>>> for key in directions:
|
||||
>>> if isinstance(key, list):
|
||||
>>> # special case for bad keys
|
||||
>>> special_flag = True
|
||||
>>> break
|
||||
>>> else:
|
||||
>>> # normal case for bad values
|
||||
>>> curr = curr[key]
|
||||
>>> if special_flag:
|
||||
>>> assert part['data'] in curr.keys()
|
||||
>>> assert part['data'] is key[1]
|
||||
>>> else:
|
||||
>>> assert part['data'] is curr
|
||||
"""
|
||||
needs_check = True
|
||||
if quickcheck:
|
||||
try:
|
||||
            # There might be a more efficient way to do this check; we
            # duplicate a lot of work by checking for unserializable data
            # this way.
|
||||
json.dumps(data)
|
||||
except Exception:
|
||||
# If there is unserializable data, find out where it is.
|
||||
# is_serializable = False
|
||||
pass
|
||||
else:
|
||||
# is_serializable = True
|
||||
needs_check = False
|
||||
|
||||
if needs_check:
|
||||
# mode = 'new'
|
||||
# if mode == 'new':
|
||||
scalar_types = (int, float, str, type(None))
|
||||
container_types = (tuple, list, dict)
|
||||
serializable_types = scalar_types + container_types
|
||||
walker = ub.IndexableWalker(data)
|
||||
for prefix, value in walker:
|
||||
*root, key = prefix
|
||||
if not isinstance(key, scalar_types):
|
||||
# Special case where a dict key is the error value
|
||||
                # Purposely make loc non-hashable so it's not confused with
                # an address. All we can know in this case is that the key
                # is at this level; there is no concept of where.
|
||||
yield {"loc": root + [[".keys", key]], "data": key}
|
||||
elif not isinstance(value, serializable_types):
|
||||
yield {"loc": prefix, "data": value}
|
||||
|
||||
|
||||
def indexable_allclose(dct1, dct2, return_info=False):
|
||||
"""
|
||||
Walks through two nested data structures and ensures that everything is
|
||||
roughly the same.
|
||||
|
||||
Args:
|
||||
dct1: a nested indexable item
|
||||
dct2: a nested indexable item
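        return_info (bool): if True, also return an info dict with
            "passlist" and "faillist" entries describing the comparison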
|
||||
|
||||
Example:
|
||||
>>> dct1 = {
|
||||
>>> 'foo': [1.222222, 1.333],
|
||||
>>> 'bar': 1,
|
||||
>>> 'baz': [],
|
||||
>>> }
|
||||
>>> dct2 = {
|
||||
>>> 'foo': [1.22222, 1.333],
|
||||
>>> 'bar': 1,
|
||||
>>> 'baz': [],
|
||||
>>> }
|
||||
>>> assert indexable_allclose(dct1, dct2)
|
||||
"""
|
||||
walker1 = ub.IndexableWalker(dct1)
|
||||
walker2 = ub.IndexableWalker(dct2)
|
||||
flat_items1 = [
|
||||
(path, value)
|
||||
for path, value in walker1
|
||||
if not isinstance(value, walker1.indexable_cls) or len(value) == 0
|
||||
]
|
||||
flat_items2 = [
|
||||
(path, value)
|
||||
for path, value in walker2
|
||||
if not isinstance(value, walker1.indexable_cls) or len(value) == 0
|
||||
]
|
||||
|
||||
flat_items1 = sorted(flat_items1)
|
||||
flat_items2 = sorted(flat_items2)
|
||||
|
||||
if len(flat_items1) != len(flat_items2):
|
||||
info = {"faillist": ["length mismatch"]}
|
||||
final_flag = False
|
||||
else:
|
||||
passlist = []
|
||||
faillist = []
|
||||
|
||||
for t1, t2 in zip(flat_items1, flat_items2):
|
||||
p1, v1 = t1
|
||||
p2, v2 = t2
|
||||
assert p1 == p2
|
||||
|
||||
flag = v1 == v2
|
||||
if not flag:
|
||||
if (
|
||||
isinstance(v1, float)
|
||||
and isinstance(v2, float)
|
||||
and np.isclose(v1, v2)
|
||||
):
|
||||
flag = True
|
||||
if flag:
|
||||
passlist.append(p1)
|
||||
else:
|
||||
faillist.append((p1, v1, v2))
|
||||
|
||||
final_flag = len(faillist) == 0
|
||||
info = {
|
||||
"passlist": passlist,
|
||||
"faillist": faillist,
|
||||
}
|
||||
|
||||
if return_info:
|
||||
return final_flag, info
|
||||
else:
|
||||
return final_flag
|
|
@@ -0,0 +1,235 @@
|
|||
import numpy as np
|
||||
import ubelt as ub
|
||||
|
||||
|
||||
def __tabulate_issue():
|
||||
# MWE for tabulate issue
|
||||
# The decimals are not aligned when using "," in the floatfmt
|
||||
import tabulate
|
||||
|
||||
data = [
|
||||
[
|
||||
13213.2,
|
||||
3213254.23,
|
||||
432432.231,
|
||||
],
|
||||
[432432.0, 432.3, 3.2],
|
||||
]
|
||||
print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=",.02f"))
|
||||
print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=".02f"))
|
||||
|
||||
|
||||
def __groupby_issue():
|
||||
# MWE of an issue with pandas groupby
|
||||
import pandas as pd
|
||||
|
||||
data = pd.DataFrame(
|
||||
[
|
||||
{"p1": "a", "p2": 1, "p3": 0},
|
||||
{"p1": "a", "p2": 1, "p3": 0},
|
||||
{"p1": "a", "p2": 2, "p3": 0},
|
||||
{"p1": "b", "p2": 2, "p3": 0},
|
||||
{"p1": "b", "p2": 1, "p3": 0},
|
||||
{"p1": "b", "p2": 1, "p3": 0},
|
||||
{"p1": "b", "p2": 1, "p3": 0},
|
||||
]
|
||||
)
|
||||
|
||||
by = "p1"
|
||||
key = list(data.groupby(by))[0][0]
|
||||
result = {"by": by, "key": key, "type(key)": type(key)}
|
||||
print(f"result = {ub.repr2(result, nl=1)}")
|
||||
assert not ub.iterable(
|
||||
key
|
||||
), "`by` is specified as a scalar, so getting `key` as a scalar makes sense"
|
||||
|
||||
by = ["p1"]
|
||||
key = list(data.groupby(by))[0][0]
|
||||
result = {"by": by, "key": key, "type(key)": type(key)}
|
||||
print(f"result = {ub.repr2(result, nl=1)}")
|
||||
assert not ub.iterable(key), (
|
||||
"`by` is specified as a list of scalars (with one element), but we "
|
||||
"still get `key` as a scalar. This does not make sense"
|
||||
)
|
||||
|
||||
by = ["p1", "p2"]
|
||||
key = list(data.groupby(by))[0][0]
|
||||
result = {"by": by, "key": key, "type(key)": type(key)}
|
||||
print(f"result = {ub.repr2(result, nl=1)}")
|
||||
assert ub.iterable(key), (
|
||||
"`by` is specified as a list of scalars (with multiple elements), "
|
||||
"and we still get `key` as a tuple of values. This makes sense"
|
||||
)
|
||||
|
||||
|
||||
def aggregate_stats(data, suffix="", group_keys=None):
|
||||
"""
|
||||
    Given columns interpreted as containing stats, aggregate those stats
    within each group. In each aggregated row, any non-group, non-stat column
    with a consistent value across the group is kept as-is; otherwise that
    column is set to None.
|
||||
|
||||
Args:
|
||||
data (DataFrame):
|
||||
a data frame with columns: 'mean', 'std', 'min', 'max', and 'nobs'
|
||||
(possibly with a suffix)
|
||||
|
||||
suffix (str):
|
||||
if the nobs, std, mean, min, and max have a suffix, specify it
|
||||
|
||||
        group_keys (List[str]):
            columns to group by; defaults to all non-stat columns
|
||||
|
||||
Returns:
|
||||
DataFrame:
|
||||
New dataframe where grouped rows have been aggregated into a single
|
||||
row.
|
||||
|
||||
Example:
|
||||
>>> import sys, ubelt
|
||||
>>> sys.path.append(ubelt.expandpath('~/code/ultrajson'))
|
||||
>>> from json_benchmarks.benchmarker.util_stats import * # NOQA
|
||||
>>> import pandas as pd
|
||||
>>> data = pd.DataFrame([
|
||||
>>> #
|
||||
>>> {'mean': 8, 'std': 1, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'a', 'p2': 1},
|
||||
>>> {'mean': 6, 'std': 2, 'min': 0, 'max': 1, 'nobs': 3, 'p1': 'a', 'p2': 1},
|
||||
>>> {'mean': 7, 'std': 3, 'min': 0, 'max': 2, 'nobs': 5, 'p1': 'a', 'p2': 2},
|
||||
>>> {'mean': 5, 'std': 4, 'min': 0, 'max': 3, 'nobs': 7, 'p1': 'a', 'p2': 1},
|
||||
>>> #
|
||||
>>> {'mean': 3, 'std': 1, 'min': 0, 'max': 20, 'nobs': 6, 'p1': 'b', 'p2': 1},
|
||||
>>> {'mean': 0, 'std': 2, 'min': 0, 'max': 20, 'nobs': 26, 'p1': 'b', 'p2': 2},
|
||||
>>> {'mean': 9, 'std': 3, 'min': 0, 'max': 20, 'nobs': 496, 'p1': 'b', 'p2': 1},
|
||||
>>> #
|
||||
>>> {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'c', 'p2': 2},
|
||||
>>> {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 7, 'p1': 'c', 'p2': 2},
|
||||
>>> #
|
||||
>>> {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'd', 'p2': 2},
|
||||
>>> #
|
||||
>>> {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'e', 'p2': 1},
|
||||
>>> ])
|
||||
>>> print(data)
|
||||
>>> new_data = aggregate_stats(data)
|
||||
>>> print(new_data)
|
||||
>>> new_data1 = aggregate_stats(data, group_keys=['p1'])
|
||||
>>> print(new_data1)
|
||||
>>> new_data2 = aggregate_stats(data, group_keys=['p2'])
|
||||
>>> print(new_data2)
|
||||
"""
|
||||
import pandas as pd
|
||||
|
||||
# Stats groupings
|
||||
raw_stats_cols = ["nobs", "std", "mean", "max", "min"]
|
||||
stats_cols = [c + suffix for c in raw_stats_cols]
|
||||
mapper = dict(zip(stats_cols, raw_stats_cols))
|
||||
unmapper = dict(zip(raw_stats_cols, stats_cols))
|
||||
non_stats_cols = list(ub.oset(data.columns) - stats_cols)
|
||||
if group_keys is None:
|
||||
group_keys = non_stats_cols
|
||||
non_group_keys = list(ub.oset(non_stats_cols) - group_keys)
|
||||
|
||||
new_rows = []
|
||||
for group_vals, group in list(data.groupby(group_keys)):
|
||||
# hack, is this a pandas bug in 1.4.1? Is it fixed? (Not in 1.4.2)
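        # (see __groupby_issue above for a minimal reproduction of this)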
|
||||
if isinstance(group_keys, list) and len(group_keys) == 1:
|
||||
# For some reason, when we specify group keys as a list of one
|
||||
# element, we get a squeezed value out
|
||||
group_vals = (group_vals,)
|
||||
stat_data = group[stats_cols].rename(mapper, axis=1)
|
||||
new_stats = combine_stats_arrs(stat_data)
|
||||
new_time_stats = ub.map_keys(unmapper, new_stats)
|
||||
new_row = ub.dzip(group_keys, group_vals)
|
||||
if non_group_keys:
|
||||
for k in non_group_keys:
|
||||
unique_vals = group[k].unique()
|
||||
if len(unique_vals) == 1:
|
||||
new_row[k] = unique_vals[0]
|
||||
else:
|
||||
new_row[k] = None
|
||||
new_row.update(new_time_stats)
|
||||
new_rows.append(new_row)
|
||||
new_data = pd.DataFrame(new_rows)
|
||||
return new_data
|
||||
|
||||
|
||||
def stats_dict(data, suffix=""):
|
||||
stats = {
|
||||
"nobs" + suffix: len(data),
|
||||
"mean" + suffix: data.mean(),
|
||||
"std" + suffix: data.std(),
|
||||
"min" + suffix: data.min(),
|
||||
"max" + suffix: data.max(),
|
||||
}
|
||||
return stats
|
||||
|
||||
|
||||
def combine_stats(s1, s2):
|
||||
"""
|
||||
Helper for combining mean and standard deviation of multiple measurements
|
||||
|
||||
Args:
|
||||
        s1 (dict): stats dict containing nobs, mean, std, min, and max
        s2 (dict): stats dict containing nobs, mean, std, min, and max
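
    Returns:
        dict: combined stats over the pooled observations, in the same format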
|
||||
|
||||
Example:
|
||||
>>> basis = {
|
||||
>>> 'nobs1': [1, 10, 100, 10000],
|
||||
>>> 'nobs2': [1, 10, 100, 10000],
|
||||
>>> }
|
||||
>>> for params in ub.named_product(basis):
|
||||
>>> data1 = np.random.rand(params['nobs1'])
|
||||
>>> data2 = np.random.rand(params['nobs2'])
|
||||
>>> data3 = np.hstack([data1, data2])
|
||||
>>> s1 = stats_dict(data1)
|
||||
>>> s2 = stats_dict(data2)
|
||||
>>> s3 = stats_dict(data3)
|
||||
>>> # Check that our combo works
|
||||
>>> combo_s3 = combine_stats(s1, s2)
|
||||
>>> compare = pd.DataFrame({'raw': s3, 'combo': combo_s3})
|
||||
>>> print(compare)
|
||||
>>> assert np.allclose(compare.raw, compare.combo)
|
||||
|
||||
References:
|
||||
.. [SO7753002] https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations
|
||||
.. [SO2971315] https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
|
||||
"""
|
||||
stats = [s1, s2]
|
||||
data = {
|
||||
"nobs": np.array([s["nobs"] for s in stats]),
|
||||
"mean": np.array([s["mean"] for s in stats]),
|
||||
"std": np.array([s["std"] for s in stats]),
|
||||
"min": np.array([s["min"] for s in stats]),
|
||||
"max": np.array([s["max"] for s in stats]),
|
||||
}
|
||||
return combine_stats_arrs(data)
|
||||
|
||||
|
||||
def combine_stats_arrs(data):
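    """
    Combine per-group stats arrays into stats for the pooled observations.

    The combined mean is the size-weighted mean of the group means, and the
    combined variance follows the standard pooling identity (ddof=0):

        var = (sum(n_i * var_i) + sum(n_i * (mean_i - mean) ** 2)) / sum(n_i)
    """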
|
||||
sizes = data["nobs"]
|
||||
means = data["mean"]
|
||||
stds = data["std"]
|
||||
mins = data["min"]
|
||||
maxs = data["max"]
|
||||
varis = stds * stds
|
||||
|
||||
# TODO: ddof
|
||||
# https://github.com/Erotemic/misc/blob/28cf797b9b0f8bd82e3ebee2f6d0a688ecee2838/learn/stats.py#L128
|
||||
|
||||
combo_size = sizes.sum()
|
||||
combo_mean = (sizes * means).sum() / combo_size
|
||||
|
||||
mean_deltas = means - combo_mean
|
||||
|
||||
sv = (sizes * varis).sum()
|
||||
sm = (sizes * (mean_deltas * mean_deltas)).sum()
|
||||
combo_vars = (sv + sm) / combo_size
|
||||
combo_std = np.sqrt(combo_vars)
|
||||
|
||||
combo_stats = {
|
||||
"nobs": combo_size,
|
||||
"mean": combo_mean,
|
||||
"std": combo_std,
|
||||
"min": mins.min(),
|
||||
"max": maxs.max(),
|
||||
}
|
||||
return combo_stats
|
|
@@ -0,0 +1,119 @@
|
|||
import pandas as pd
|
||||
import ubelt as ub
|
||||
|
||||
|
||||
def benchmark_analysis(
|
||||
rows,
|
||||
xlabel,
|
||||
group_labels,
|
||||
basis,
|
||||
):
|
||||
# xlabel = "size"
|
||||
# Set these to empty lists if they are not used
|
||||
# group_labels = {
|
||||
# "col": ["input"],
|
||||
# "hue": ["impl"],
|
||||
# "size": [],
|
||||
# }
|
||||
# group_keys = {}
|
||||
# for gname, labels in group_labels.items():
|
||||
# group_keys[gname + "_key"] = ub.repr2(
|
||||
# ub.dict_isect(params, labels), compact=1, si=1
|
||||
# )
|
||||
# key = ub.repr2(params, compact=1, si=1)
|
||||
|
||||
from process_tracker.result_analysis import SkillTracker
|
||||
|
||||
RECORD_ALL = 0
|
||||
|
||||
USE_OPENSKILL = True
|
||||
|
||||
metric_key = "time" if RECORD_ALL else "min"
|
||||
|
||||
# The rows define a long-form pandas data array.
|
||||
# Data in long-form makes it very easy to use seaborn.
|
||||
data = pd.DataFrame(rows)
|
||||
data = data.sort_values(metric_key)
|
||||
|
||||
if RECORD_ALL:
|
||||
# Show the min / mean if we record all
|
||||
min_times = data.groupby("key").min().rename({"time": "min"}, axis=1)
|
||||
mean_times = (
|
||||
data.groupby("key")[["time"]].mean().rename({"time": "mean"}, axis=1)
|
||||
)
|
||||
stats_data = pd.concat([min_times, mean_times], axis=1)
|
||||
stats_data = stats_data.sort_values("min")
|
||||
else:
|
||||
stats_data = data
|
||||
|
||||
if USE_OPENSKILL:
|
||||
# Track the "skill" of each method
|
||||
# The idea is that each setting of parameters is a game, and each
|
||||
# "impl" is a player. We rank the players by which is fastest, and
|
||||
# update their ranking according to the Weng-Lin Bayes ranking model.
|
||||
        # This does not take into account the fact that some "games" (i.e.
        # parameter settings) are more important than others, but it should
        # be fairly robust on average.
|
||||
skillboard = SkillTracker(basis["impl"])
|
||||
|
||||
other_keys = sorted(
|
||||
set(stats_data.columns)
|
||||
- {"key", "impl", "min", "mean", "hue_key", "size_key", "style_key"}
|
||||
)
|
||||
for params, variants in stats_data.groupby(other_keys):
|
||||
variants = variants.sort_values("mean")
|
||||
ranking = variants["impl"].reset_index(drop=True)
|
||||
|
||||
mean_speedup = variants["mean"].max() / variants["mean"]
|
||||
stats_data.loc[mean_speedup.index, "mean_speedup"] = mean_speedup
|
||||
min_speedup = variants["min"].max() / variants["min"]
|
||||
stats_data.loc[min_speedup.index, "min_speedup"] = min_speedup
|
||||
|
||||
if USE_OPENSKILL:
|
||||
skillboard.observe(ranking)
|
||||
|
||||
print("Statistics:")
|
||||
print(stats_data)
|
||||
|
||||
if USE_OPENSKILL:
|
||||
win_probs = skillboard.predict_win()
|
||||
win_probs = ub.sorted_vals(win_probs, reverse=True)
|
||||
print(
|
||||
"Aggregated Rankings = {}".format(
|
||||
ub.repr2(win_probs, nl=1, precision=4, align=":")
|
||||
)
|
||||
)
|
||||
|
||||
plot = True
|
||||
if plot:
|
||||
        # kwplot.autosns works well for IPython and script execution
        # (not sure about notebooks); plain seaborn is used here.
|
||||
import seaborn as sns
|
||||
|
||||
sns.set()
|
||||
from matplotlib import pyplot as plt
|
||||
|
||||
plotkw = {}
|
||||
for gname, labels in group_labels.items():
|
||||
if labels:
|
||||
plotkw[gname] = gname + "_key"
|
||||
|
||||
# Your variables may change
|
||||
# ax = plt.figure().gca()
|
||||
col = plotkw.pop("col")
|
||||
facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False)
|
||||
facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw)
|
||||
facet.add_legend()
|
||||
# sns.lineplot(data=data, )
|
||||
# ax.set_title('JSON Benchmarks')
|
||||
# ax.set_xlabel('Size')
|
||||
# ax.set_ylabel('Time')
|
||||
# ax.set_xscale('log')
|
||||
# ax.set_yscale('log')
|
||||
|
||||
try:
|
||||
__IPYTHON__
|
||||
except NameError:
|
||||
plt.show()
|
|
@@ -0,0 +1,82 @@
|
|||
"""
|
||||
Main definition of the benchmarks
|
||||
"""
|
||||
import scriptconfig as scfg
|
||||
import ubelt as ub
|
||||
|
||||
from json_benchmarks import analysis, measures
|
||||
|
||||
|
||||
class CoreConfig(scfg.Config):
|
||||
"""
|
||||
Benchmark JSON implementations
|
||||
"""
|
||||
|
||||
default = {
|
||||
"mode": scfg.Value(
|
||||
"all",
|
||||
position=1,
|
||||
choices=["all", "single", "run", "analyze"],
|
||||
help=ub.paragraph(
|
||||
"""
|
||||
By default all benchmarks are run, saved, and aggregated
|
||||
with any other existing benchmarks for analysis and
|
||||
visualization.
|
||||
|
||||
In "single" mode, other existing benchmarks are ignord.
|
||||
|
||||
In "run" mode, the benchmarks are run, but no analysis is done.
|
||||
|
||||
In "analyze" mode, no benchmarks are run, but any existing
|
||||
benchmarks are loaded for analysis and visualization.
|
||||
"""
|
||||
),
|
||||
),
|
||||
"cache_dir": scfg.Value(
|
||||
None,
|
||||
help=ub.paragraph(
|
||||
"""
|
||||
Location for benchmark cache.
|
||||
Defaults to $XDG_CACHE/ujson/benchmark_results/
|
||||
"""
|
||||
),
|
||||
),
|
||||
}
|
||||
|
||||
def normalize(self):
|
||||
dpath = self["cache_dir"]
|
||||
if dpath is None:
|
||||
dpath = ub.Path.appdir("ujson/benchmark_results")
|
||||
dpath = ub.Path(dpath)
|
||||
self["cache_dir"] = dpath
|
||||
|
||||
|
||||
def main(cmdline=True, **kwargs):
|
||||
"""
|
||||
Example:
|
||||
>>> import sys, ubelt
|
||||
>>> sys.path.append(ubelt.expandpath('~/code/ultrajson'))
|
||||
>>> from json_benchmarks.core import * # NOQA
|
||||
>>> import kwplot
|
||||
>>> kwplot.autosns()
|
||||
>>> cmdline = False
|
||||
>>> kwargs = {}
|
||||
>>> main(cmdline, **kwargs)
|
||||
"""
|
||||
config = CoreConfig(cmdline=cmdline, data=kwargs)
|
||||
dpath = config["cache_dir"]
|
||||
print(f"dpath={dpath}")
|
||||
|
||||
run = config["mode"] in {"all", "single", "run"}
|
||||
if run:
|
||||
result_fpath = measures.benchmark_json()
|
||||
print(f"result_fpath = {result_fpath!r}")
|
||||
result_fpaths = [result_fpath]
|
||||
|
||||
agg = config["mode"] not in {"single"}
|
||||
if agg:
|
||||
result_fpaths = list(dpath.glob("benchmarks*.json"))
|
||||
|
||||
analyze = config["mode"] in {"all", "single", "analyze"}
|
||||
if analyze:
|
||||
analysis.analyze_results(result_fpaths)
|
|
@@ -0,0 +1,120 @@
|
|||
import random
|
||||
import sys
|
||||
|
||||
import ubelt as ub
|
||||
|
||||
|
||||
def json_test_data_generators():
|
||||
"""
|
||||
Generates data for benchmarks with various sizes
|
||||
|
||||
Returns:
|
||||
Dict[str, callable]:
|
||||
a mapping from test data name to its generator
|
||||
|
||||
Example:
|
||||
>>> data_lut = json_test_data_generators()
|
||||
>>> size = 2
|
||||
>>> keys = sorted(set(data_lut) - {'Complex object'})
|
||||
>>> for key in keys:
|
||||
>>> func = data_lut[key]
|
||||
>>> test_object = func(size)
|
||||
>>> print('key = {!r}'.format(key))
|
||||
>>> print('test_object = {!r}'.format(test_object))
|
||||
"""
|
||||
data_lut = {}
|
||||
|
||||
    def _register_data(name):
        def _wrap(func):
            data_lut[name] = func
            # return the function so the decorated name stays usable
            return func

        return _wrap
|
||||
|
||||
# seed if desired
|
||||
# rng = random.Random(0)
|
||||
rng = random
|
||||
|
||||
@_register_data("Array with doubles")
|
||||
def array_with_doubles(size):
|
||||
test_object = [sys.maxsize * rng.random() for _ in range(size)]
|
||||
return test_object
|
||||
|
||||
@_register_data("Array with UTF-8 strings")
|
||||
def array_with_utf8_strings(size):
|
||||
utf8_string = (
|
||||
"نظام الحكم سلطاني وراثي "
|
||||
"في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية"
|
||||
" الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين "
|
||||
)
|
||||
test_object = [utf8_string for _ in range(size)]
|
||||
return test_object
|
||||
|
||||
@_register_data("Medium complex object")
|
||||
def medium_complex_object(size):
|
||||
user = {
|
||||
"userId": 3381293,
|
||||
"age": 213,
|
||||
"username": "johndoe",
|
||||
"fullname": "John Doe the Second",
|
||||
"isAuthorized": True,
|
||||
"liked": 31231.31231202,
|
||||
"approval": 31.1471,
|
||||
"jobs": [1, 2],
|
||||
"currJob": None,
|
||||
}
|
||||
friends = [user, user, user, user, user, user, user, user]
|
||||
test_object = [[user, friends] for _ in range(size)]
|
||||
return test_object
|
||||
|
||||
@_register_data("Array with True values")
|
||||
def true_values(size):
|
||||
test_object = [True for _ in range(size)]
|
||||
return test_object
|
||||
|
||||
@_register_data("Array of Dict[str, int]")
|
||||
def array_of_dict_string_int(size):
|
||||
test_object = [
|
||||
{str(rng.random() * 20): int(rng.random() * 1000000)} for _ in range(size)
|
||||
]
|
||||
return test_object
|
||||
|
||||
@_register_data("Dict of List[Dict[str, int]]")
|
||||
def dict_of_list_dict_str_int(size):
|
||||
keys = set()
|
||||
while len(keys) < size:
|
||||
key = str(rng.random() * 20)
|
||||
keys.add(key)
|
||||
test_object = {
|
||||
key: [
|
||||
{str(rng.random() * 20): int(rng.random() * 1000000)}
|
||||
for _ in range(256)
|
||||
]
|
||||
for key in keys
|
||||
}
|
||||
return test_object
|
||||
|
||||
@_register_data("Complex object")
|
||||
def complex_object(size):
|
||||
import json
|
||||
|
||||
        # TODO: might be better to register this file with setup.py or
        # download it via some mechanism
|
||||
try:
|
||||
dpath = ub.Path(__file__).parent
|
||||
fpath = dpath / "sample.json"
|
||||
if not fpath.exists():
|
||||
                raise Exception("sample.json not found")
|
||||
except Exception:
|
||||
import ujson
|
||||
|
||||
dpath = ub.Path(ujson.__file__).parent / "tests"
|
||||
fpath = dpath / "sample.json"
|
||||
if not fpath.exists():
|
||||
                raise Exception("sample.json not found")
|
||||
with open(fpath) as f:
|
||||
test_object = json.load(f)
|
||||
if size is not None:
|
||||
test_object = [test_object] * size
|
||||
return test_object
|
||||
|
||||
return data_lut
|
|
@@ -0,0 +1,99 @@
|
|||
"""
|
||||
Define the json libraries we are considering
|
||||
"""
|
||||
|
||||
KNOWN_LIBRARIES = [
|
||||
{"modname": "ujson", "distname": "ujson"},
|
||||
# {"modname": "nujson", "distname": "nujson"},
|
||||
# {"modname": "orjson", "distname": "orjson"},
|
||||
# {"modname": "simplejson", "distname": "simplejson"},
|
||||
{"modname": "json", "distname": "<stdlib>"},
|
||||
# {"modname": "simdjson", "distname": "pysimdjson"},
|
||||
# {"modname": "cysimdjson", "distname": "cysimdjson"},
|
||||
# {"modname": "libpy_simdjson", "distname": "libpy-simdjson"},
|
||||
]
|
||||
|
||||
KNOWN_MODNAMES = [info["modname"] for info in KNOWN_LIBRARIES]
|
||||
|
||||
|
||||
# TODO:
|
||||
# def distname_to_modnames(distname):
|
||||
# # TODO: nice way to switch between a module's import name and its distribution name
|
||||
# # References:
|
||||
# # https://stackoverflow.com/questions/49764802/get-module-name-programmatically-with-only-pypi-package-name/49764960#49764960
|
||||
# import distlib.database
|
||||
# distlib.database.DistributionPath().get_distribution(distname)
|
||||
# # import importlib.metadata
|
||||
# # importlib.metadata.metadata(distname)
|
||||
# # importlib.util.find_spec(modname)
|
||||
# # import simdjson
|
||||
# # import pkg_resources
|
||||
# # pkg_resources.get_distribution('pysimdjson')
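#
# A possible sketch (untested here; assumes Python >= 3.10, where
# importlib.metadata.packages_distributions is available):
#
# def distname_to_modnames(distname):
#     import importlib.metadata
#     # packages_distributions() maps top-level import names to the
#     # distributions that provide them; invert it to go the other way.
#     mapping = importlib.metadata.packages_distributions()
#     return [mod for mod, dists in mapping.items() if distname in dists]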
|
||||
|
||||
|
||||
class Compatibility:
    """
    Expose a common API for all tested implementations
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def lut_dumps(module):
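        # The benchmark treats cysimdjson and simdjson as parse-only (no
        # usable dumps), so return None and let callers skip them.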
|
||||
if module.__name__ == "cysimdjson":
|
||||
return None
|
||||
elif module.__name__ == "simdjson":
|
||||
return None
|
||||
else:
|
||||
return getattr(module, "dumps", None)
|
||||
|
||||
@staticmethod
|
||||
def lut_loads(module):
|
||||
if module.__name__ == "cysimdjson":
|
||||
parser = module.JSONParser()
|
||||
return parser.loads
|
||||
else:
|
||||
return getattr(module, "loads", None)
|
||||
|
||||
|
||||
def available_json_impls():
|
||||
"""
|
||||
Return a dictionary of information about each json implementation
|
||||
|
||||
Example:
|
||||
        >>> import ubelt as ub
        >>> from json_benchmarks.libraries import *  # NOQA
|
||||
>>> json_impls = available_json_impls()
|
||||
>>> print('json_impls = {}'.format(ub.repr2(json_impls, nl=1)))
|
||||
"""
|
||||
import importlib
|
||||
|
||||
import pkg_resources
|
||||
|
||||
known_libinfo = KNOWN_LIBRARIES
|
||||
json_impls = {}
|
||||
for libinfo in known_libinfo:
|
||||
modname = libinfo["modname"]
|
||||
distname = libinfo["distname"]
|
||||
try:
|
||||
module = importlib.import_module(modname)
|
||||
except ImportError:
|
||||
pass
|
||||
else:
|
||||
mod_version = getattr(module, "__version__", None)
|
||||
if distname == "<stdlib>":
|
||||
pkg_version = mod_version
|
||||
else:
|
||||
pkg_version = pkg_resources.get_distribution(distname).version
|
||||
if mod_version is not None:
|
||||
assert mod_version == pkg_version
|
||||
version = pkg_version
|
||||
            dumps = Compatibility.lut_dumps(module)
|
||||
            loads = Compatibility.lut_loads(module)
|
||||
impl_info = {
|
||||
"module": module,
|
||||
"modname": modname,
|
||||
"distname": distname,
|
||||
"version": version,
|
||||
"dumps": dumps,
|
||||
"loads": loads,
|
||||
}
|
||||
json_impls[modname] = impl_info
|
||||
return json_impls
|
|
@@ -0,0 +1,132 @@
|
|||
"""
|
||||
The definitions of the measurements we want to take
|
||||
"""
|
||||
import json
|
||||
|
||||
import scriptconfig as scfg
|
||||
import ubelt as ub
|
||||
|
||||
from json_benchmarks import libraries
|
||||
|
||||
|
||||
class MeasurementConfig(scfg.Config):
|
||||
default = {
|
||||
"disable": scfg.Value(
|
||||
[],
|
||||
choices=libraries.KNOWN_MODNAMES,
|
||||
help=ub.paragraph(
|
||||
"""
|
||||
Remove specified libraries from the benchmarks
|
||||
"""
|
||||
),
|
||||
),
|
||||
"factor": scfg.Value(
|
||||
1.0,
|
||||
help=ub.paragraph(
|
||||
"""
|
||||
Specify as a fraction to speed up benchmarks for development /
|
||||
testing
|
||||
"""
|
||||
),
|
||||
),
|
||||
"cache_dir": scfg.Value(
|
||||
None,
|
||||
help=ub.paragraph(
|
||||
"""
|
||||
Location for benchmark cache.
|
||||
Defaults to $XDG_CACHE/ujson/benchmark_results/
|
||||
"""
|
||||
),
|
||||
),
|
||||
}
|
||||
|
||||
def normalize(self):
|
||||
dpath = self["cache_dir"]
|
||||
if dpath is None:
|
||||
dpath = ub.Path.appdir("ujson/benchmark_results")
|
||||
dpath = ub.Path(dpath)
|
||||
self["cache_dir"] = dpath
|
||||
|
||||
|
||||
def benchmark_json():
|
||||
from json_benchmarks import benchmarker, datagen, libraries
|
||||
|
||||
json_impls = libraries.available_json_impls()
|
||||
data_lut = datagen.json_test_data_generators()
|
||||
|
||||
# These are the parameters that we benchmark over
|
||||
common_basis = {
|
||||
"impl": list(json_impls.keys()),
|
||||
"func": ["dumps", "loads"],
|
||||
}
|
||||
sized_basis = {
|
||||
"input": [
|
||||
"Array with doubles",
|
||||
"Array with UTF-8 strings",
|
||||
# 'Medium complex object',
|
||||
"Array with True values",
|
||||
"Array of Dict[str, int]",
|
||||
# 'Dict of List[Dict[str, int]]',
|
||||
# 'Complex object'
|
||||
],
|
||||
"size": [1, 2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192],
|
||||
}
|
||||
predefined_basis = {
|
||||
"input": ["Complex object"],
|
||||
"size": [None],
|
||||
}
|
||||
|
||||
basis = [
|
||||
ub.dict_union(common_basis, predefined_basis),
|
||||
ub.dict_union(common_basis, sized_basis),
|
||||
]
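
    # Each basis dict expands into the Cartesian product of its value lists,
    # so every combination of impl, func, input, and size gets timed.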
|
||||
|
||||
# The Benchmarker class is a new experimental API around timerit to
|
||||
# abstract away the details of timing a process over a grid of parameters,
|
||||
# serializing the results, and aggregating results from disparate runs.
|
||||
benchmark = benchmarker.Benchmarker(
|
||||
name="bench_json",
|
||||
num=100,
|
||||
bestof=10,
|
||||
verbose=3,
|
||||
basis=basis,
|
||||
)
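
    # For reference, a single cell of this grid measured with raw timerit
    # would look roughly like the sketch below (method and data are the same
    # objects prepared in the loop that follows):
    #
    #   import timerit
    #   ti = timerit.Timerit(num=100, bestof=10)
    #   for timer in ti:
    #       with timer:
    #           method(data)
    #   print(ti.min(), ti.mean())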
|
||||
|
||||
def is_blocked(params):
|
||||
if params["input"] == "Complex object":
|
||||
# Some libraries can't handle the complex object
|
||||
if params["impl"] in {"orjson", "libpy_simdjson"}:
|
||||
return True
|
||||
|
||||
# For each variation of your experiment, create a row.
|
||||
for params in benchmark.iter_params():
|
||||
if is_blocked(params):
|
||||
continue
|
||||
# Make any modifications you need to compute input kwargs for each
|
||||
# method here.
|
||||
impl_info = json_impls[params["impl"]]
|
||||
params["impl_version"] = impl_info["version"]
|
||||
method = impl_info[params["func"]]
|
||||
if method is None:
|
||||
# Not all libraries implement all methods
|
||||
continue
|
||||
py_data = data_lut[params["input"]](params["size"])
|
||||
if params["func"] == "dumps":
|
||||
data = py_data
|
||||
elif params["func"] == "loads":
|
||||
data = json.dumps(py_data)
|
||||
# Timerit will run some user-specified number of loops.
|
||||
# and compute time stats with similar methodology to timeit
|
||||
try:
|
||||
for timer in benchmark.measure():
|
||||
# Put any setup logic you dont want to time here.
|
||||
# ...
|
||||
with timer:
|
||||
# Put the logic you want to time here
|
||||
method(data)
|
||||
except Exception as ex:
|
||||
print(f"Failed to time: ex={ex}. Skipping")
|
||||
|
||||
dpath = ub.Path.appdir("ujson/benchmark_results").ensuredir()
|
||||
result_fpath = benchmark.dump_in_dpath(dpath)
|
||||
return result_fpath
|