mirror of https://github.com/ultrajson/ultrajson.git
synced 2024-05-24 22:36:36 +02:00

Port datasets

This commit is contained in:
  parent da6428296d
  commit d036df252f
@@ -8,58 +8,136 @@ import sys

 import ubelt as ub


-def data_lut(input, size):
-    if input == "Array with UTF-8 strings":
-        test_object = []
-        for x in range(size):
-            test_object.append(
-                "نظام الحكم سلطاني وراثي "
-                "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية"
-                " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين "
-            )
+def json_test_data_generators():
+    """
+    Generates data for benchmarks with various sizes
+
+    Returns:
+        Dict[str, callable]:
+            a mapping from test data name to its generator
+
+    Example:
+        >>> data_lut = json_test_data_generators()
+        >>> size = 2
+        >>> keys = sorted(set(data_lut) - {'Complex object'})
+        >>> for key in keys:
+        >>>     func = data_lut[key]
+        >>>     test_object = func(size)
+        >>>     print('key = {!r}'.format(key))
+        >>>     print('test_object = {!r}'.format(test_object))
+    """
+    data_lut = {}
+
+    def _register_data(name):
+        def _wrap(func):
+            data_lut[name] = func
+        return _wrap
+
+    # seed if desired
+    #rng = random.Random()
+    rng = random
+
+    @_register_data('Array with doubles')
+    def array_with_doubles(size):
+        test_object = [sys.maxsize * rng.random() for _ in range(size)]
+        return test_object
-    elif input == "Array with doubles":
-        test_object = []
-        for x in range(256):
-            test_object.append(sys.maxsize * random.random())
-    else:
-        raise KeyError(input)
+
+    @_register_data('Array with UTF-8 strings')
+    def array_with_utf8_strings(size):
+        utf8_string = (
+            "نظام الحكم سلطاني وراثي "
+            "في الذكور من ذرية السيد تركي بن سعيد بن سلطان ويشترط فيمن يختار لولاية"
+            " الحكم من بينهم ان يكون مسلما رشيدا عاقلا ًوابنا شرعيا لابوين عمانيين "
+        )
+        test_object = [utf8_string for _ in range(size)]
+        return test_object
+
+    @_register_data('Medium complex object')
+    def medium_complex_object(size):
+        user = {
+            "userId": 3381293,
+            "age": 213,
+            "username": "johndoe",
+            "fullname": "John Doe the Second",
+            "isAuthorized": True,
+            "liked": 31231.31231202,
+            "approval": 31.1471,
+            "jobs": [1, 2],
+            "currJob": None,
+        }
+        friends = [user, user, user, user, user, user, user, user]
+        test_object = [[user, friends] for _ in range(size)]
+        return test_object
+
+    @_register_data('Array with True values')
+    def true_values(size):
+        test_object = [True for _ in range(size)]
+        return test_object
+
+    @_register_data('Array of Dict[str, int]')
+    def array_of_dict_string_int(size):
+        test_object = [
+            {str(rng.random() * 20): int(rng.random() * 1000000)}
+            for _ in range(size)
+        ]
+        return test_object
+
+    @_register_data('Dict of List[Dict[str, int]]')
+    def dict_of_list_dict_str_int(size):
+        keys = set()
+        while len(keys) < size:
+            key = str(rng.random() * 20)
+            keys.add(key)
+        test_object = {
+            key: [
+                {str(rng.random() * 20): int(rng.random() * 1000000)}
+                for _ in range(256)
+            ]
+            for key in keys
+        }
+        return test_object
+
+    @_register_data('Complex object')
+    def complex_object(size):
+        import json
+        # TODO: might be better to register this file with setup.py or
+        # download it via some mechanism
+        try:
+            dpath = ub.Path(__file__).parent
+            fpath = dpath / 'sample.json'
+            if not fpath.exists():
+                raise Exception
+        except Exception:
+            import ujson
+            dpath = ub.Path(ujson.__file__).parent / 'tests'
+            fpath = dpath / 'sample.json'
+        if not fpath.exists():
+            raise Exception
+        with open(fpath, 'r') as f:
+            test_object = json.load(f)
+        if size > 1:
+            test_object = [test_object] * size
+        return test_object
+
+    return data_lut


 def available_json_impls():
-    JSON_IMPLS = {}
-
-    try:
-        import json
-        JSON_IMPLS["json"] = json
-    except ImportError:
-        pass
-
-    try:
-        import ujson
-        JSON_IMPLS["ujson"] = ujson
-    except ImportError:
-        pass
-
-    try:
-        import nujson
-        JSON_IMPLS["nujson"] = nujson
-    except ImportError:
-        pass
-
-    try:
-        import orjson
-        JSON_IMPLS["nujson"] = orjson
-    except ImportError:
-        pass
-
-    try:
-        import simplejson
-        JSON_IMPLS["simplejson"] = simplejson
-    except ImportError:
-        pass
-
-    return JSON_IMPLS
+    import importlib
+    known_modnames = [
+        'ujson', 'json', 'nujson', 'orjson', 'simplejson'
+    ]
+    json_impls = {}
+    for libname in known_modnames:
+        try:
+            module = importlib.import_module(libname)
+        except ImportError:
+            pass
+        else:
+            json_impls[libname] = {
+                'module': module,
+                'version': module.__version__,
+            }
+    return json_impls
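Note: a minimal sketch (not part of the commit) of how the two helpers above are meant to compose; `json_test_data_generators` returns the name-to-generator mapping and `available_json_impls` returns the importable modules with their versions:

    # Sketch: generate one input and serialize it with every available impl.
    data_lut = json_test_data_generators()
    impls = available_json_impls()

    test_object = data_lut['Array with doubles'](32)  # a list of 32 floats
    for libname, info in impls.items():
        # each entry carries the module and its reported version
        encoded = info['module'].dumps(test_object)
        print(libname, info['version'], len(encoded))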
@@ -67,28 +145,34 @@ def benchmark_json_dumps():
     sys.path.append(ub.expandpath('~/code/ultrajson/tests'))
     from benchmarker import Benchmarker

-    JSON_IMPLS = available_json_impls()
+    json_impls = available_json_impls()
+    data_lut = json_test_data_generators()

-    version_infos = {k: v.__version__ for k, v in JSON_IMPLS.items()}
-
-    def method_lut(impl):
-        return JSON_IMPLS[impl].dumps
+    list(data_lut.keys())

     # These are the parameters that we benchmark over
     basis = {
         "input": [
-            "Array with UTF-8 strings",
-            "Array with doubles",
+            'Array with doubles',
+            'Array with UTF-8 strings',
+            # 'Medium complex object',
+            'Array with True values',
+            'Array of Dict[str, int]',
+            # 'Dict of List[Dict[str, int]]',
+            # 'Complex object'
         ],
         "size": [1, 32, 256, 1024, 2048],
-        "impl": list(JSON_IMPLS.keys()),
+        "impl": list(json_impls.keys()),
     }

     # The Benchmarker class is a new experimental API around timerit to
     # abstract away the details of timing a process over a grid of parameters,
     # serializing the results, and aggregating results from disparate runs.
     benchmark = Benchmarker(
         name='bench_json_dumps',
         # Change params here to modify number of trials
         num=100,
         bestof=10,
         verbose=2,
         basis=basis,
     )
@@ -96,11 +180,11 @@ def benchmark_json_dumps():
     for params in benchmark.iter_params():
         # Make any modifications you need to compute input kwargs for each
         # method here.
-        impl = params["impl"]
-        impl_version = version_infos[impl]
+        impl_info = json_impls[params["impl"]]
+        method = impl_info['module'].dumps
+        impl_version = impl_info['version']
         params["impl_version"] = impl_version
-        method = method_lut(impl)
-        data = data_lut(params["input"], params["size"])
+        data = data_lut[params["input"]](params["size"])
         # Timerit will run some user-specified number of loops.
         # and compute time stats with similar methodology to timeit
         for timer in benchmark.measure():
@@ -114,20 +198,25 @@ def benchmark_json_dumps():
     benchmark.dump_in_dpath(dpath)

     RECORD_ALL = 0
-    metric_key = "time" if RECORD_ALL else "mean"
+    metric_key = "time" if RECORD_ALL else "mean_time"

     from benchmarker import result_analysis
     results = benchmark.result.to_result_list()

     analysis = result_analysis.ResultAnalysis(
         results,
         metrics=[metric_key],
         params=['impl'],
         metric_objectives={
-            'min': 'min',
-            'mean': 'min',
+            'min_time': 'min',
+            'mean_time': 'min',
             'time': 'min',
         })
     analysis.analysis()
     analysis.table

+    param_group = ['impl', 'impl_version']
+    analysis.abalate(param_group)
+
     # benchmark_analysis(rows, xlabel, group_labels, basis, RECORD_ALL)
@@ -1,28 +0,0 @@
-
-def check_ttest():
-    import scipy
-    import scipy.stats  # NOQA
-    from benchmarker.benchmarker import stats_dict
-    import numpy as np
-    metric_vals1 = np.random.randn(10000) + 0.01
-    metric_vals2 = np.random.randn(1000)
-
-    stats1 = stats_dict(metric_vals1)
-    stats2 = stats_dict(metric_vals2)
-
-    ind_kw = dict(
-        equal_var=0,
-        # alternative='two-sided'
-        alternative='less' if stats1['mean'] < stats2['mean'] else 'greater'
-    )
-
-    # Not sure why these are slightly different
-    res1 = scipy.stats.ttest_ind(metric_vals1, metric_vals2, **ind_kw)
-
-    res2 = scipy.stats.ttest_ind_from_stats(
-        stats1['mean'], stats1['std'], stats1['n'],
-        stats2['mean'], stats2['std'], stats2['n'],
-        **ind_kw
-    )
-    print('res1 = {!r}'.format(res1))
-    print('res2 = {!r}'.format(res2))
@@ -8,9 +8,9 @@ from benchmarker.process_context import ProcessContext

 @dataclass
 class BenchmarkerConfig:
-    name : str = None
-    num : int = 100
-    bestof : int = 10
+    name: str = None
+    num: int = 100
+    bestof: int = 10


 class BenchmarkerResult:
@@ -97,14 +97,16 @@ class Benchmarker:
         >>> dpath = ub.Path.appdir('benchmarker/demo').ensuredir()
         >>> self.dump_in_dpath(dpath)
     """
-    def __init__(self, basis={}, **kwargs):
+    def __init__(self, basis={}, verbose=1, **kwargs):
         self.basis = basis

         self.config = BenchmarkerConfig(**kwargs)

         self.ti = timerit.Timerit(
             num=self.config.num,
-            bestof=self.config.bestof)
+            bestof=self.config.bestof,
+            verbose=verbose,
+        )
         self.context = ProcessContext(name=self.config.name)
         self.rows = []
         self.RECORD_ALL = 0
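Note: a minimal usage sketch (not part of the commit) showing how the new `verbose` argument reaches the underlying timerit object; the with-block assumes `measure()` yields timerit-style context-manager timers, as the benchmark script above suggests, and the timed body is a hypothetical stand-in workload:

    benchmark = Benchmarker(name='demo', num=10, bestof=3, verbose=1,
                            basis={'size': [10, 100]})
    for params in benchmark.iter_params():
        for timer in benchmark.measure():
            with timer:
                sum(range(params['size']))  # stand-in workload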
@@ -152,7 +154,7 @@ class Benchmarker:
             rows.append(row)
         else:
             times = np.array(ti.robust_times())
-            metrics = stats_dict(times)
+            metrics = stats_dict(times, '_time')
             row = {
                 'metrics': metrics,
                 'params': params,
@@ -161,13 +163,13 @@ class Benchmarker:
             rows.append(row)


-def stats_dict(data):
+def stats_dict(data, suffix=''):
     stats = {
-        'n': len(data),
-        'mean': data.mean(),
-        'std': data.std(),
-        'min': data.min(),
-        'max': data.max(),
+        'nobs' + suffix: len(data),
+        'mean' + suffix: data.mean(),
+        'std' + suffix: data.std(),
+        'min' + suffix: data.min(),
+        'max' + suffix: data.max(),
     }
     return stats
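Note: with the new `suffix` argument the stat keys become self-describing, which is what the `mean_time` / `min_time` metric keys in the benchmark script rely on. A quick sketch:

    import numpy as np
    times = np.array([0.10, 0.15, 0.20])
    stats_dict(times, '_time')
    # -> {'nobs_time': 3, 'mean_time': 0.15, 'std_time': ...,
    #     'min_time': 0.1, 'max_time': 0.2}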
@@ -182,12 +184,12 @@ def combine_stats(s1, s2):
     Example:
         >>> basis = {
-        >>>     'n1': [1, 10, 100, 10000],
-        >>>     'n2': [1, 10, 100, 10000],
+        >>>     'nobs1': [1, 10, 100, 10000],
+        >>>     'nobs2': [1, 10, 100, 10000],
         >>> }
         >>> for params in ub.named_product(basis):
-        >>>     data1 = np.random.rand(params['n1'])
-        >>>     data2 = np.random.rand(params['n2'])
+        >>>     data1 = np.random.rand(params['nobs1'])
+        >>>     data2 = np.random.rand(params['nobs2'])
         >>>     data3 = np.hstack([data1, data2])
         >>>     s1 = stats_dict(data1)
         >>>     s2 = stats_dict(data2)
@@ -203,7 +205,7 @@ def combine_stats(s1, s2):
        https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
    """
    stats = [s1, s2]
-    sizes = np.array([s['n'] for s in stats])
+    sizes = np.array([s['nobs'] for s in stats])
    means = np.array([s['mean'] for s in stats])
    stds = np.array([s['std'] for s in stats])
    mins = np.array([s['min'] for s in stats])
@@ -221,7 +223,7 @@ def combine_stats(s1, s2):
     combo_std = np.sqrt(combo_vars)

     combo_stats = {
-        'n': combo_size,
+        'nobs': combo_size,
         'mean': combo_mean,
         'std': combo_std,
         'min': mins.min(),
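Note: `combine_stats` pools moments without needing the raw samples: the combined mean is the size-weighted mean, and the combined variance adds a within-group and a between-group term (see the math.stackexchange link in the docstring). A quick consistency sketch, mirroring the docstring example:

    import numpy as np
    data1, data2 = np.random.rand(10), np.random.rand(100)
    s1, s2 = stats_dict(data1), stats_dict(data2)
    combined = combine_stats(s1, s2)
    # should match stats computed directly on the concatenated samples
    direct = stats_dict(np.hstack([data1, data2]))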
@@ -8,6 +8,19 @@ import scipy
 import scipy.stats  # NOQA


+# a list of common objectives
+DEFAULT_METRIC_TO_OBJECTIVE = {
+    'time': 'min',
+    'ap': 'max',
+    'acc': 'max',
+    'f1': 'max',
+    'mcc': 'max',
+    #
+    'loss': 'min',
+    'brier': 'min',
+}
+
+
 class Result(ub.NiceRepr):
     """
     Storage of names, parameters, and quality metrics for a single experiment.
@@ -31,6 +44,10 @@ class Result(ub.NiceRepr):
         >>> self = Result.demo(rng=32)
         >>> print('self = {}'.format(self))
         self = <Result(name=53f57161,f1=0.33,acc=0.75,param1=1,param2=6.67,param3=a)>
+
+    Example:
+        >>> self = Result.demo(mode='alt', rng=32)
+        >>> print('self = {}'.format(self))
     """
     def __init__(self, name, params, metrics, meta=None):
         self.name = name
@@ -48,21 +65,43 @@ class Result(ub.NiceRepr):
         return text

     @classmethod
-    def demo(cls, rng=None):
+    def demo(cls, mode='null', rng=None):
         import numpy as np
         import string
         import kwarray
         rng = kwarray.ensure_rng(rng)
-        demo_param_space = {
-            'param1': list(range(3)),
-            'param2': np.linspace(0, 10, 10),
-            'param3': list(string.ascii_lowercase[0:3]),
-        }
-        params = {k: rng.choice(b) for k, b in demo_param_space.items()}
-        metrics = {
-            'f1': rng.rand(),
-            'acc': rng.rand(),
-        }
+
+        if mode == 'null':
+            # The null hypothesis should generally be true here,
+            # there is no relation between the results and parameters
+            demo_param_space = {
+                'param1': list(range(3)),
+                'param2': np.linspace(0, 10, 10),
+                'param3': list(string.ascii_lowercase[0:3]),
+            }
+            params = {k: rng.choice(b) for k, b in demo_param_space.items()}
+            metrics = {
+                'f1': rng.rand(),
+                'acc': rng.rand(),
+            }
+        elif mode == 'alt':
+            # The alternative hypothesis should be true here, there is a
+            # relationship between the results and two of the params.
+            from scipy.special import expit
+            params = {
+                'w': rng.randint(-1, 1),
+                'x': rng.randint(-3, 3),
+                'y': rng.randint(-2, 2),
+                'z': rng.randint(-3, 3),
+            }
+            noise = np.random.randn() * 1
+            r = 3 * params['x'] + params['y'] ** 2 + 0.3 * params['z'] ** 3
+            acc = expit(r / 20 + noise)
+            metrics = {
+                'acc': acc,
+            }
+        else:
+            raise KeyError(mode)
         name = ub.hash_data(params)[0:8]
         self = cls(name, params, metrics)
         return self
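Note: in 'alt' mode the metric is tied to the params through a logistic link, so the analysis should detect an effect for x, y, and z but not for w, which is drawn but never enters the response. A sketch of the planted relationship (names taken from the code above):

    from scipy.special import expit  # logistic sigmoid
    # acc rises with x, with |y| (y enters squared), and with z**3;
    # w acts as a negative control with no effect on acc
    r = 3 * x + y ** 2 + 0.3 * z ** 3
    acc = expit(r / 20 + noise)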
@@ -105,6 +144,10 @@ class ResultAnalysis(ub.NiceRepr):
         >>> self = ResultAnalysis.demo()
         >>> self.analysis()

+    Example:
+        >>> self = ResultAnalysis.demo(num=5000, mode='alt')
+        >>> self.analysis()
+
     Example:
         >>> # Given a list of experiments, configs, and results
         >>> # Create a ResultAnalysis object
@@ -168,7 +211,8 @@ class ResultAnalysis(ub.NiceRepr):

     def __init__(self, results, metrics=None, params=None, ignore_params=None,
                  ignore_metrics=None, metric_objectives=None,
-                 abalation_orders={1}, default_objective='max'):
+                 abalation_orders={1}, default_objective='max',
+                 p_threshold=0.05):
         self.results = results
         if ignore_metrics is None:
             ignore_metrics = set()
@@ -181,23 +225,15 @@ class ResultAnalysis(ub.NiceRepr):
         self.default_objective = default_objective

         # encode if we want to maximize or minimize a metric
-        default_metric_to_objective = {
-            'ap': 'max',
-            'acc': 'max',
-            'f1': 'max',
-            #
-            'loss': 'min',
-            'brier': 'min',
-        }
         if metric_objectives is None:
             metric_objectives = {}

-        self.metric_objectives = default_metric_to_objective.copy()
+        self.metric_objectives = DEFAULT_METRIC_TO_OBJECTIVE.copy()
         self.metric_objectives.update(metric_objectives)

         self.params = params
         self.metrics = metrics
         self.statistics = None
+        self.p_threshold = p_threshold

         self._description = {}
         self._description['built'] = False
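Note: user-supplied objectives now layer on top of the shared module-level default table. A minimal sketch (not part of the commit) of how a caller overrides it, mirroring the benchmark script earlier in this diff:

    analysis = ResultAnalysis(
        results,
        metrics=['mean_time'],
        # anything not listed here falls back to DEFAULT_METRIC_TO_OBJECTIVE
        metric_objectives={'mean_time': 'min'},
        p_threshold=0.05,
    )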
@@ -210,11 +246,14 @@ class ResultAnalysis(ub.NiceRepr):
         return ub.repr2(self._description, si=1, sv=1)

     @classmethod
-    def demo(cls, num=10, rng=None):
+    def demo(cls, num=10, mode='null', rng=None):
         import kwarray
         rng = kwarray.ensure_rng(rng)
-        results = [Result.demo(rng=rng) for _ in range(num)]
-        self = cls(results, metrics={'f1', 'acc'})
+        results = [Result.demo(mode=mode, rng=rng) for _ in range(num)]
+        if mode == 'null':
+            self = cls(results, metrics={'f1', 'acc'})
+        else:
+            self = cls(results, metrics={'acc'})
         return self

     def run(self):
@@ -251,18 +290,30 @@ class ResultAnalysis(ub.NiceRepr):
         varied = {k: vs for k, vs in varied.items() if len(vs)}
         return varied

-    def abalation_groups(self, param):
+    def abalation_groups(self, param_group, k=2):
         """
+        Return groups where the specified parameter(s) are varied, but all
+        other non-ignored parameters are held the same.
+
+        Args:
+            param_group (str | List[str]):
+                One or more parameters that are allowed to vary
+
+            k (int):
+                minimum number of items a group must contain to be returned
+
         Returns:
             List[DataFrame]:
                 a list of subsets of the table where all but the specified
                 (non-ignored) parameters are allowed to vary.

         Example:
             >>> self = ResultAnalysis.demo()
             >>> param = 'param2'
             >>> self.abalation_groups(param)
         """
-        if not ub.iterable(param):
-            param = [param]
+        if not ub.iterable(param_group):
+            param_group = [param_group]
         table = self.table
         config_rows = [r.params for r in self.results]
         config_keys = list(map(set, config_rows))
@@ -271,14 +322,14 @@ class ResultAnalysis(ub.NiceRepr):
         if self.ignore_params:
             config_keys = [c - self.ignore_params for c in config_keys]
         isect_params = set.intersection(*config_keys)
-        other_params = sorted(isect_params - set(param))
+        other_params = sorted(isect_params - set(param_group))
         groups = []
         for key, group in table.groupby(other_params, dropna=False):
-            if len(group) > 1:
+            if len(group) >= k:
                 groups.append(group)
         return groups

-    def abalate(self, param):
+    def abalate(self, param_group):
         """
         Example:
             >>> self = ResultAnalysis.demo(100)
@@ -287,34 +338,34 @@ class ResultAnalysis(ub.NiceRepr):
             >>> self.abalate(param)

             >>> self = ResultAnalysis.demo()
-            >>> param = ['param2', 'param3']
-            >>> self.abalate(param)
+            >>> param_group = ['param2', 'param3']
+            >>> # xdoctest: +REQUIRES(module:openskill)
+            >>> self.abalate(param_group)
         """
         import itertools as it
         if self.table is None:
             self.table = self.build_table()
-        if not ub.iterable(param):
-            param = [param]
+        if not ub.iterable(param_group):
+            param_group = [param_group]

         # For hashable generic dictionary
         from collections import namedtuple
-        gd = namedtuple('config', param)
+        gd = namedtuple('config', param_group)

         # from types import SimpleNamespace
-        param_unique_vals_ = self.table[param].drop_duplicates().to_dict('records')
+        param_unique_vals_ = self.table[param_group].drop_duplicates().to_dict('records')
         param_unique_vals = [gd(**d) for d in param_unique_vals_]
-        # param_unique_vals = {p: self.table[p].unique().tolist() for p in param}
+        # param_unique_vals = {p: self.table[p].unique().tolist() for p in param_group}
         score_improvements = ub.ddict(list)
         scored_obs = []
         skillboard = SkillTracker(param_unique_vals)
-        groups = self.abalation_groups(param)
+        groups = self.abalation_groups(param_group, k=2)

         for group in groups:
             for metric_key in self.metrics:
                 ascending = self._objective_is_ascending(metric_key)

                 group = group.sort_values(metric_key, ascending=ascending)
-                subgroups = group.groupby(param)
+                subgroups = group.groupby(param_group)
                 if ascending:
                     best_idx = subgroups[metric_key].idxmax()
                 else:
@@ -326,19 +377,19 @@ class ResultAnalysis(ub.NiceRepr):
                 if x1 != x2:
                     r1 = best_group.loc[x1]
                     r2 = best_group.loc[x2]
-                    k1 = gd(**r1[param])
-                    k2 = gd(**r2[param])
+                    k1 = gd(**r1[param_group])
+                    k2 = gd(**r2[param_group])
                     diff = r1[metric_key] - r2[metric_key]
                     score_improvements[(k1, k2, metric_key)].append(diff)

             # metric_vals = best_group[metric_key].values
             # diffs = metric_vals[None, :] - metric_vals[:, None]
-            best_group.set_index(param)
-            # best_group[param]
+            best_group.set_index(param_group)
+            # best_group[param_group]
             # best_group[metric_key].diff()
-            scored_ranking = best_group[param + [metric_key]].reset_index(drop=True)
+            scored_ranking = best_group[param_group + [metric_key]].reset_index(drop=True)
             scored_obs.append(scored_ranking)
-            ranking = [gd(**d) for d in scored_ranking[param].to_dict('records')]
+            ranking = [gd(**d) for d in scored_ranking[param_group].to_dict('records')]
             skillboard.observe(ranking)

         print('skillboard.ratings = {}'.format(ub.repr2(skillboard.ratings, nl=1, align=':')))
@@ -377,15 +428,12 @@ class ResultAnalysis(ub.NiceRepr):
         # TODO : document these stats clearly and accurately

         Example:
-            >>> self = ResultAnalysis.demo(num=30)
+            >>> self = ResultAnalysis.demo(num=100)
             >>> print(self.table)
-            >>> param_group = ['param2']
+            >>> param_group = ['param2', 'param1']
             >>> metric_key = 'f1'
             >>> stats_row = self.test_group(param_group, metric_key)
-            >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, precision=2)))
-            >>> # ---
-            >>> self.build()
-            >>> self.report()
+            >>> print('stats_row = {}'.format(ub.repr2(stats_row, nl=2, sort=0, precision=2)))
         """
         param_group_name = ','.join(param_group)
         stats_row = {
@@ -461,10 +509,6 @@ class ResultAnalysis(ub.NiceRepr):
         pairwise_statistics = []
         for pair in value_pairs:
             pair_statistics = {}
-            # try:
-            #     param_val1, param_val2 = sorted(pair)
-            # except Exception:
-            #     param_val1, param_val2 = (pair)
             param_val1, param_val2 = pair

             metric_vals1 = value_to_metric[param_val1]
@@ -477,16 +521,17 @@ class ResultAnalysis(ub.NiceRepr):
             pair_statistics['value2'] = param_val2
             pair_statistics['n1'] = len(metric_vals1)
             pair_statistics['n2'] = len(metric_vals2)
-            # TODO: probably want to use an alternative=less or greater here
-            # instead of simply unequal
-            alternative = 'two-sided'
-            if 1:
+
+            TEST_ONLY_FOR_DIFFERENCE = True
+            if TEST_ONLY_FOR_DIFFERENCE:
                 if ascending:
                     # We want to minimize the metric
                     alternative = 'less' if rank1 < rank2 else 'greater'
                 else:
                     # We want to maximize the metric
                     alternative = 'greater' if rank1 < rank2 else 'less'
+            else:
+                alternative = 'two-sided'

             ind_kw = dict(
                 equal_var=False,
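Note: the `alternative` chosen above feeds scipy's independent t-test; testing only in the direction of the observed ranking asks "is the apparent winner really better" rather than "are these merely different". A standalone sketch (not part of the commit) of that one-sided usage:

    import numpy as np
    import scipy.stats
    rng = np.random.RandomState(0)
    a = rng.randn(100) + 0.1  # apparent winner (slightly higher mean)
    b = rng.randn(100)
    # one-sided test in the direction suggested by the observed ranking
    res = scipy.stats.ttest_ind(a, b, equal_var=False, alternative='greater')
    print(res.pvalue)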
@@ -499,8 +544,8 @@ class ResultAnalysis(ub.NiceRepr):
             stats1 = stats_dict(metric_vals1)
             stats2 = stats_dict(metric_vals2)
             scipy.stats.ttest_ind_from_stats(
-                stats1['mean'], stats1['std'], stats1['n'],
-                stats2['mean'], stats2['std'], stats2['n'],
+                stats1['mean'], stats1['std'], stats1['nobs'],
+                stats2['mean'], stats2['std'], stats2['nobs'],
                 **ind_kw
             )
             # metric_vals1, metric_vals2, equal_var=False)
@@ -523,6 +568,8 @@ class ResultAnalysis(ub.NiceRepr):
             for nk in common:
                 group1 = nk_to_group1[nk]
                 group2 = nk_to_group2[nk]
+                # TODO: Not sure if taking the product of everything within
+                # the comparable group is correct or not. I think it is ok.
                 for i, j in it.product(group1.index, group2.index):
                     comparable_indexes1.append(i)
                     comparable_indexes2.append(j)
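Note: the added TODO concerns the pairing strategy for the related-samples test; every row in group1 is paired with every row in group2 that shares the same nuisance key. A self-contained sketch of that expansion (the index values are hypothetical):

    import itertools as it
    group1_index = [0, 1]  # rows for params A under one nuisance key
    group2_index = [5, 7]  # rows for params B under the same key
    pairs = list(it.product(group1_index, group2_index))
    # -> [(0, 5), (0, 7), (1, 5), (1, 7)]; n_pairs grows multiplicatively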
@@ -590,7 +637,6 @@ class ResultAnalysis(ub.NiceRepr):
         self._description['built'] = True

     def report(self):
-        p_threshold = 0.05
         stat_groups = ub.group_items(self.statistics, key=lambda x: x['param_name'])
         stat_groups_items = list(stat_groups.items())
@@ -600,43 +646,47 @@ class ResultAnalysis(ub.NiceRepr):
             'metrics': self.metrics_of_interest,
         })
         for grid_item in grid:
-            metric_key = grid_item['metrics']
-            stat_groups_item = grid_item['stat_group_item']
-
-            param_name, stat_group = stat_groups_item
-            stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0]
-            title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key))
-            print('\n\n')
-            print(title)
-            print('=' * len(title))
-            print(stats_row['moments'])
-            anova_rank_p = stats_row['anova_rank_p']
-            anova_mean_p = stats_row['anova_mean_p']
-            # Rougly speaking
-            print('')
-            print(f'ANOVA: If p is low, the param {param_name!r} might have an effect')
-            print(ub.color_text(f'  Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None))
-            print(ub.color_text(f'  Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None))
-            print('')
-            print('Pairwise T-Tests')
-            for pairstat in stats_row['pairwise']:
-                # Is this backwards?
-                value1 = pairstat['value1']
-                value2 = pairstat['value2']
-                winner = pairstat['winner']
-                if value2 == winner:
-                    value1, value2 = value2, value1
-                print(f'  If p is low, {param_name}={value1} may outperform {param_name}={value2}.')
-                if 'ttest_ind' in pairstat:
-                    ttest_ind_result = pairstat['ttest_ind']
-                    print(ub.color_text(f'    ttest_ind:  p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None))
-                if 'ttest_rel' in pairstat:
-                    n_common = pairstat['n_common']
-                    ttest_rel_result = pairstat['ttest_ind']
-                    print(ub.color_text(f'    ttest_rel:  p={ttest_rel_result.pvalue:0.8f}, n={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None))
+            self._report_one(grid_item)

         print(self.stats_table)

+    def _report_one(self, grid_item):
+        p_threshold = self.p_threshold
+        metric_key = grid_item['metrics']
+        stat_groups_item = grid_item['stat_group_item']
+
+        param_name, stat_group = stat_groups_item
+        stats_row = ub.group_items(stat_group, key=lambda x: x['metric'])[metric_key][0]
+        title = ('PARAMETER: {} - METRIC: {}'.format(param_name, metric_key))
+        print('\n\n')
+        print(title)
+        print('=' * len(title))
+        print(stats_row['moments'])
+        anova_rank_p = stats_row['anova_rank_p']
+        anova_mean_p = stats_row['anova_mean_p']
+        # Roughly speaking
+        print('')
+        print(f'ANOVA: If p is low, the param {param_name!r} might have an effect')
+        print(ub.color_text(f'  Rank-ANOVA: p={anova_rank_p:0.8f}', 'green' if anova_rank_p < p_threshold else None))
+        print(ub.color_text(f'  Mean-ANOVA: p={anova_mean_p:0.8f}', 'green' if anova_mean_p < p_threshold else None))
+        print('')
+        print('Pairwise T-Tests')
+        for pairstat in stats_row['pairwise']:
+            # Is this backwards?
+            value1 = pairstat['value1']
+            value2 = pairstat['value2']
+            winner = pairstat['winner']
+            if value2 == winner:
+                value1, value2 = value2, value1
+            print(f'  If p is low, {param_name}={value1} may outperform {param_name}={value2}.')
+            if 'ttest_ind' in pairstat:
+                ttest_ind_result = pairstat['ttest_ind']
+                print(ub.color_text(f'    ttest_ind:  p={ttest_ind_result.pvalue:0.8f}', 'green' if ttest_ind_result.pvalue < p_threshold else None))
+            if 'ttest_rel' in pairstat:
+                n_common = pairstat['n_common']
+                ttest_rel_result = pairstat['ttest_ind']
+                print(ub.color_text(f'    ttest_rel:  p={ttest_rel_result.pvalue:0.8f}, n_pairs={n_common}', 'green' if ttest_rel_result.pvalue < p_threshold else None))

     def conclusions(self):
         conclusions = []
         for stat in self.statistics:
@@ -653,6 +703,50 @@ class ResultAnalysis(ub.NiceRepr):
             conclusions.append(txt)
         return conclusions

+    def plot(self, xlabel, metric_key, group_labels):
+        """
+        Example:
+            >>> self = ResultAnalysis.demo(num=5000, mode='alt')
+            >>> self.analysis()
+            >>> print('self = {}'.format(self))
+            >>> # xdoctest: +REQUIRES(module:kwplot)
+            >>> import kwplot
+            >>> kwplot.autompl()
+            >>> xlabel = 'x'
+            >>> metric_key = 'acc'
+            >>> group_labels = {
+            >>>     'col': ['y', 'w'],
+            >>>     'hue': ['z'],
+            >>>     'size': [],
+            >>> }
+            >>> self.plot(xlabel, metric_key, group_labels)
+        """
+        import seaborn as sns
+        sns.set()
+        from matplotlib import pyplot as plt  # NOQA
+        data = self.table
+        data = data.sort_values(metric_key)
+        for gname, labels in group_labels.items():
+            if len(labels):
+                new_col = []
+                for row in data[labels].to_dict('records'):
+                    item = ub.repr2(row, compact=1, si=1)
+                    new_col.append(item)
+                gkey = gname + "_key"
+                data[gkey] = new_col
+
+        plotkw = {}
+        for gname, labels in group_labels.items():
+            if labels:
+                plotkw[gname] = gname + "_key"
+
+        # Your variables may change
+        # ax = plt.figure().gca()
+        col = plotkw.pop("col")
+        facet = sns.FacetGrid(data, col=col, sharex=False, sharey=False)
+        facet.map_dataframe(sns.lineplot, x=xlabel, y=metric_key, marker="o", **plotkw)
+        facet.add_legend()
+

 class SkillTracker:
     """
@@ -677,6 +771,9 @@ class SkillTracker:
         4: 0.20,
         5: 0.20,
     }
+
+    Requirements:
+        openskill
     """

     def __init__(self, player_ids):