import numpy as np
import ubelt as ub


def __tabulate_issue():
    # MWE for tabulate issue
    # The decimals are not aligned when using "," in the floatfmt
    import tabulate

    data = [
        [
            13213.2,
            3213254.23,
            432432.231,
        ],
        [432432.0, 432.3, 3.2],
    ]
    print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=",.02f"))
    print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=".02f"))


def __groupby_issue():
    # MWE of an issue with pandas groupby
    import pandas as pd

    data = pd.DataFrame(
        [
            {"p1": "a", "p2": 1, "p3": 0},
            {"p1": "a", "p2": 1, "p3": 0},
            {"p1": "a", "p2": 2, "p3": 0},
            {"p1": "b", "p2": 2, "p3": 0},
            {"p1": "b", "p2": 1, "p3": 0},
            {"p1": "b", "p2": 1, "p3": 0},
            {"p1": "b", "p2": 1, "p3": 0},
        ]
    )

    by = "p1"
    key = list(data.groupby(by))[0][0]
    result = {"by": by, "key": key, "type(key)": type(key)}
    print(f"result = {ub.repr2(result, nl=1)}")
    assert not ub.iterable(
        key
    ), "`by` is specified as a scalar, so getting `key` as a scalar makes sense"

    by = ["p1"]
    key = list(data.groupby(by))[0][0]
    result = {"by": by, "key": key, "type(key)": type(key)}
    print(f"result = {ub.repr2(result, nl=1)}")
    assert not ub.iterable(key), (
        "`by` is specified as a list of scalars (with one element), but we "
        "still get `key` as a scalar. This does not make sense"
    )

    by = ["p1", "p2"]
    key = list(data.groupby(by))[0][0]
    result = {"by": by, "key": key, "type(key)": type(key)}
    print(f"result = {ub.repr2(result, nl=1)}")
    assert ub.iterable(key), (
        "`by` is specified as a list of scalars (with multiple elements), "
        "and we get `key` as a tuple of values. This makes sense"
    )


def aggregate_stats(data, suffix="", group_keys=None):
    """
    Given columns interpreted as containing stats, aggregate those stats
    within each group. For each row, any non-group, non-stat column with a
    consistent value across the group is kept as-is; otherwise the new
    column for that row is set to None.

    Args:
        data (DataFrame):
            a data frame with columns: 'mean', 'std', 'min', 'max', and
            'nobs' (possibly with a suffix)

        suffix (str):
            if the nobs, std, mean, min, and max columns have a suffix,
            specify it

        group_keys (List[str]):
            columns to group by; defaults to all non-stat columns

    Returns:
        DataFrame:
            New dataframe where grouped rows have been aggregated into a
            single row.

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/code/ultrajson'))
        >>> from json_benchmarks.benchmarker.util_stats import *  # NOQA
        >>> import pandas as pd
        >>> data = pd.DataFrame([
        >>>     #
        >>>     {'mean': 8, 'std': 1, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'a', 'p2': 1},
        >>>     {'mean': 6, 'std': 2, 'min': 0, 'max': 1, 'nobs': 3, 'p1': 'a', 'p2': 1},
        >>>     {'mean': 7, 'std': 3, 'min': 0, 'max': 2, 'nobs': 5, 'p1': 'a', 'p2': 2},
        >>>     {'mean': 5, 'std': 4, 'min': 0, 'max': 3, 'nobs': 7, 'p1': 'a', 'p2': 1},
        >>>     #
        >>>     {'mean': 3, 'std': 1, 'min': 0, 'max': 20, 'nobs': 6, 'p1': 'b', 'p2': 1},
        >>>     {'mean': 0, 'std': 2, 'min': 0, 'max': 20, 'nobs': 26, 'p1': 'b', 'p2': 2},
        >>>     {'mean': 9, 'std': 3, 'min': 0, 'max': 20, 'nobs': 496, 'p1': 'b', 'p2': 1},
        >>>     #
        >>>     {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'c', 'p2': 2},
        >>>     {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 7, 'p1': 'c', 'p2': 2},
        >>>     #
        >>>     {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'd', 'p2': 2},
        >>>     #
        >>>     {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'e', 'p2': 1},
        >>> ])
        >>> print(data)
        >>> new_data = aggregate_stats(data)
        >>> print(new_data)
        >>> new_data1 = aggregate_stats(data, group_keys=['p1'])
        >>> print(new_data1)
        >>> new_data2 = aggregate_stats(data, group_keys=['p2'])
        >>> print(new_data2)
    """
    import pandas as pd

    # Stats groupings
    raw_stats_cols = ["nobs", "std", "mean", "max", "min"]
    stats_cols = [c + suffix for c in raw_stats_cols]
    mapper = dict(zip(stats_cols, raw_stats_cols))
    unmapper = dict(zip(raw_stats_cols, stats_cols))
    non_stats_cols = list(ub.oset(data.columns) - stats_cols)
    if group_keys is None:
        group_keys = non_stats_cols
    non_group_keys = list(ub.oset(non_stats_cols) - group_keys)

    new_rows = []
    for group_vals, group in list(data.groupby(group_keys)):
        # hack, is this a pandas bug in 1.4.1? Is it fixed? (Not in 1.4.2)
        if isinstance(group_keys, list) and len(group_keys) == 1:
            # For some reason, when we specify group keys as a list of one
            # element, we get a squeezed value out
            group_vals = (group_vals,)
        stat_data = group[stats_cols].rename(mapper, axis=1)
        new_stats = combine_stats_arrs(stat_data)
        new_time_stats = ub.map_keys(unmapper, new_stats)
        new_row = ub.dzip(group_keys, group_vals)
        if non_group_keys:
            for k in non_group_keys:
                unique_vals = group[k].unique()
                if len(unique_vals) == 1:
                    new_row[k] = unique_vals[0]
                else:
                    new_row[k] = None
        new_row.update(new_time_stats)
        new_rows.append(new_row)

    new_data = pd.DataFrame(new_rows)
    return new_data


def stats_dict(data, suffix=""):
    """
    Compute summary statistics (optionally with a key suffix) for a 1D
    array or Series of measurements.
    """
    stats = {
        "nobs" + suffix: len(data),
        "mean" + suffix: data.mean(),
        "std" + suffix: data.std(),
        "min" + suffix: data.min(),
        "max" + suffix: data.max(),
    }
    return stats
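

# A minimal usage sketch, in the style of the MWE functions above (the
# values are illustrative, not from the benchmarks). The dict returned by
# `stats_dict` is the contract that `combine_stats` and
# `combine_stats_arrs` consume. Note that `np.ndarray.std` defaults to
# ddof=0, which is what the pooling formula in `combine_stats_arrs`
# assumes; see the TODO there.
def __stats_dict_example():
    data = np.array([1.0, 2.0, 3.0, 4.0])
    stats = stats_dict(data)
    # {'nobs': 4, 'mean': 2.5, 'std': 1.118..., 'min': 1.0, 'max': 4.0}
    assert stats["nobs"] == 4 and stats["mean"] == 2.5
    return stats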


def combine_stats(s1, s2):
    """
    Helper for combining mean and standard deviation of multiple
    measurements

    Args:
        s1 (dict): stats dict containing nobs, mean, std, min, and max
        s2 (dict): stats dict containing nobs, mean, std, min, and max

    Example:
        >>> import pandas as pd
        >>> basis = {
        >>>     'nobs1': [1, 10, 100, 10000],
        >>>     'nobs2': [1, 10, 100, 10000],
        >>> }
        >>> for params in ub.named_product(basis):
        >>>     data1 = np.random.rand(params['nobs1'])
        >>>     data2 = np.random.rand(params['nobs2'])
        >>>     data3 = np.hstack([data1, data2])
        >>>     s1 = stats_dict(data1)
        >>>     s2 = stats_dict(data2)
        >>>     s3 = stats_dict(data3)
        >>>     # Check that our combo works
        >>>     combo_s3 = combine_stats(s1, s2)
        >>>     compare = pd.DataFrame({'raw': s3, 'combo': combo_s3})
        >>>     print(compare)
        >>>     assert np.allclose(compare.raw, compare.combo)

    References:
        .. [SO7753002] https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations
        .. [SE2971315] https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
    """
    stats = [s1, s2]
    data = {
        "nobs": np.array([s["nobs"] for s in stats]),
        "mean": np.array([s["mean"] for s in stats]),
        "std": np.array([s["std"] for s in stats]),
        "min": np.array([s["min"] for s in stats]),
        "max": np.array([s["max"] for s in stats]),
    }
    return combine_stats_arrs(data)


def combine_stats_arrs(data):
    """
    Vectorized version of `combine_stats` that pools any number of
    measurement groups at once. ``data`` maps each stat name to an array
    (or DataFrame column) with one entry per group.
    """
    sizes = data["nobs"]
    means = data["mean"]
    stds = data["std"]
    mins = data["min"]
    maxs = data["max"]
    varis = stds * stds
    # TODO: ddof
    # https://github.com/Erotemic/misc/blob/28cf797b9b0f8bd82e3ebee2f6d0a688ecee2838/learn/stats.py#L128
    combo_size = sizes.sum()
    # The pooled mean is the size-weighted average of the group means
    combo_mean = (sizes * means).sum() / combo_size
    # The pooled variance is the size-weighted sum of the group variances
    # plus the size-weighted spread of the group means around the pooled
    # mean, normalized by the total size (see the references in
    # `combine_stats`)
    mean_deltas = means - combo_mean
    sv = (sizes * varis).sum()
    sm = (sizes * (mean_deltas * mean_deltas)).sum()
    combo_vars = (sv + sm) / combo_size
    combo_std = np.sqrt(combo_vars)
    combo_stats = {
        "nobs": combo_size,
        "mean": combo_mean,
        "std": combo_std,
        "min": mins.min(),
        "max": maxs.max(),
    }
    return combo_stats
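

# A hedged end-to-end sketch in the style of the MWE functions above (not
# part of the original module): pool per-chunk stats with `combine_stats`
# and check they agree with stats computed directly on the concatenated
# data. The chunk sizes and seed below are arbitrary choices for
# illustration.
def __combine_stats_check():
    rng = np.random.RandomState(0)
    chunks = [rng.rand(n) for n in (5, 50, 500)]
    parts = [stats_dict(c) for c in chunks]
    pooled = parts[0]
    for part in parts[1:]:
        pooled = combine_stats(pooled, part)
    direct = stats_dict(np.hstack(chunks))
    for key in ("nobs", "mean", "std", "min", "max"):
        # numpy arrays use ddof=0, so the pooled std matches directly
        assert np.isclose(pooled[key], direct[key])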