mirror of
https://github.com/ultrajson/ultrajson.git
synced 2024-05-26 16:16:14 +02:00
236 lines
8.1 KiB
Python
236 lines
8.1 KiB
Python
import numpy as np
|
|
import ubelt as ub
|
|
|
|
|
|
def __tabulate_issue():
    """
    Minimal working example for a ``tabulate`` formatting issue.

    Prints the same table twice, once with a thousands separator in
    ``floatfmt`` and once without. According to the original author's note,
    the decimals are not aligned when "," is used in ``floatfmt``.

    NOTE(review): the table has three columns but only two headers are
    given -- confirm this is intentional for the reproduction.
    """
    import tabulate

    wide_row = [13213.2, 3213254.23, 432432.231]
    narrow_row = [432432.0, 432.3, 3.2]
    data = [wide_row, narrow_row]

    # First the format with the "," separator (the problematic one), then
    # the plain format for comparison.
    for floatfmt in (",.02f", ".02f"):
        print(tabulate.tabulate(data, headers=["a", "b"], floatfmt=floatfmt))
|
|
|
|
|
|
def __groupby_issue():
    """
    Minimal working example of an inconsistency in pandas ``groupby`` keys.

    Groups a small frame by a scalar column name, by a one-element list and
    by a two-element list, printing the first group key and its type each
    time. The asserts record the behavior the original author observed: a
    scalar key for a scalar ``by``, a scalar key for a one-element list
    (flagged as surprising), and a tuple key for a multi-element list.

    NOTE(review): the key type returned for a one-element list depends on
    the installed pandas version -- confirm before relying on the second
    assert.
    """
    import pandas as pd

    p1_p2_pairs = [
        ("a", 1),
        ("a", 1),
        ("a", 2),
        ("b", 2),
        ("b", 1),
        ("b", 1),
        ("b", 1),
    ]
    data = pd.DataFrame([{"p1": p1, "p2": p2, "p3": 0} for p1, p2 in p1_p2_pairs])

    def report_first_key(by):
        # Look up the key of the first group, print a summary, and hand the
        # key back so the caller can assert on its type.
        groups = list(data.groupby(by))
        key = groups[0][0]
        result = {"by": by, "key": key, "type(key)": type(key)}
        print(f"result = {ub.repr2(result, nl=1)}")
        return key

    scalar_key = report_first_key("p1")
    assert not ub.iterable(
        scalar_key
    ), "`by` is specified as a scalar, so getting `key` as a scalar makes sense"

    single_list_key = report_first_key(["p1"])
    assert not ub.iterable(single_list_key), (
        "`by` is specified as a list of scalars (with one element), but we "
        "still get `key` as a scalar. This does not make sense"
    )

    multi_list_key = report_first_key(["p1", "p2"])
    assert ub.iterable(multi_list_key), (
        "`by` is specified as a list of scalars (with multiple elements), "
        "and we still get `key` as a tuple of values. This makes sense"
    )
|
|
|
|
|
|
def aggregate_stats(data, suffix="", group_keys=None):
    """
    Given columns interpreted as containing stats, aggregate those stats
    within each group. For each row, any non-group, non-stat column
    with consistent values across that column in the group is kept as-is,
    otherwise the new column for that row is set to None.

    Args:
        data (DataFrame):
            a data frame with columns: 'mean', 'std', 'min', 'max', and 'nobs'
            (possibly with a suffix)

        suffix (str):
            if the nobs, std, mean, min, and max have a suffix, specify it

        group_keys (str | List[str] | None):
            name(s) of the columns that define a group. A single string is
            treated as a one-element list. If None, every non-stat column
            is used as a group key.

    Returns:
        DataFrame:
            New dataframe where grouped rows have been aggregated into a single
            row.

    Example:
        >>> import sys, ubelt
        >>> sys.path.append(ubelt.expandpath('~/code/ultrajson'))
        >>> from json_benchmarks.benchmarker.util_stats import *  # NOQA
        >>> import pandas as pd
        >>> data = pd.DataFrame([
        >>>     #
        >>>     {'mean': 8, 'std': 1, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'a', 'p2': 1},
        >>>     {'mean': 6, 'std': 2, 'min': 0, 'max': 1, 'nobs': 3, 'p1': 'a', 'p2': 1},
        >>>     {'mean': 7, 'std': 3, 'min': 0, 'max': 2, 'nobs': 5, 'p1': 'a', 'p2': 2},
        >>>     {'mean': 5, 'std': 4, 'min': 0, 'max': 3, 'nobs': 7, 'p1': 'a', 'p2': 1},
        >>>     #
        >>>     {'mean': 3, 'std': 1, 'min': 0, 'max': 20, 'nobs': 6, 'p1': 'b', 'p2': 1},
        >>>     {'mean': 0, 'std': 2, 'min': 0, 'max': 20, 'nobs': 26, 'p1': 'b', 'p2': 2},
        >>>     {'mean': 9, 'std': 3, 'min': 0, 'max': 20, 'nobs': 496, 'p1': 'b', 'p2': 1},
        >>>     #
        >>>     {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 2, 'p1': 'c', 'p2': 2},
        >>>     {'mean': 5, 'std': 0, 'min': 0, 'max': 1, 'nobs': 7, 'p1': 'c', 'p2': 2},
        >>>     #
        >>>     {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'd', 'p2': 2},
        >>>     #
        >>>     {'mean': 5, 'std': 2, 'min': 0, 'max': 2, 'nobs': 7, 'p1': 'e', 'p2': 1},
        >>> ])
        >>> print(data)
        >>> new_data = aggregate_stats(data)
        >>> print(new_data)
        >>> new_data1 = aggregate_stats(data, group_keys=['p1'])
        >>> print(new_data1)
        >>> new_data2 = aggregate_stats(data, group_keys=['p2'])
        >>> print(new_data2)
    """
    import pandas as pd

    # Stats groupings: map between the suffixed column names used in `data`
    # and the bare names expected by `combine_stats_arrs`.
    raw_stats_cols = ["nobs", "std", "mean", "max", "min"]
    stats_cols = [c + suffix for c in raw_stats_cols]
    mapper = dict(zip(stats_cols, raw_stats_cols))
    unmapper = dict(zip(raw_stats_cols, stats_cols))
    non_stats_cols = list(ub.oset(data.columns) - stats_cols)

    # Normalize `group_keys` to a list so that groupby, the set difference
    # and the key/value zip below all see the same sequence of column names.
    if group_keys is None:
        group_keys = non_stats_cols
    elif isinstance(group_keys, str):
        group_keys = [group_keys]
    else:
        group_keys = list(group_keys)
    non_group_keys = list(ub.oset(non_stats_cols) - group_keys)
    single_key = len(group_keys) == 1

    new_rows = []
    for group_vals, group in data.groupby(group_keys):
        # Grouping by a one-element list yields a bare scalar key on older
        # pandas and a 1-tuple on newer pandas. Only wrap the scalar form;
        # wrapping an existing 1-tuple would store the tuple itself as the
        # column value.
        already_wrapped = isinstance(group_vals, tuple) and len(group_vals) == 1
        if single_key and not already_wrapped:
            group_vals = (group_vals,)

        stat_data = group[stats_cols].rename(mapper, axis=1)
        new_stats = combine_stats_arrs(stat_data)
        new_time_stats = ub.map_keys(unmapper, new_stats)

        new_row = ub.dzip(group_keys, group_vals)
        for k in non_group_keys:
            # Keep a non-group column only when it is constant in the group.
            unique_vals = group[k].unique()
            new_row[k] = unique_vals[0] if len(unique_vals) == 1 else None
        new_row.update(new_time_stats)
        new_rows.append(new_row)

    new_data = pd.DataFrame(new_rows)
    return new_data
|
|
|
|
|
|
def stats_dict(data, suffix=""):
    """
    Summarize a sample of measurements as a dictionary of statistics.

    Args:
        data (ndarray | Series | List | Tuple):
            the measurements. Plain lists and tuples are converted to a
            numpy array; any other input is used as-is and must provide
            ``mean``, ``std``, ``min`` and ``max`` methods and ``len``.

        suffix (str):
            text appended to each key of the returned dictionary

    Returns:
        dict: maps 'nobs', 'mean', 'std', 'min' and 'max' (each followed by
            ``suffix``) to the corresponding statistic of ``data``.

    Note:
        The standard deviation is whatever ``data.std()`` returns. numpy
        arrays default to ``ddof=0`` while pandas objects default to
        ``ddof=1``.

    Example:
        >>> stats = stats_dict([1.0, 2.0, 3.0, 4.0], suffix='_time')
        >>> assert stats['nobs_time'] == 4
        >>> assert stats['mean_time'] == 2.5
    """
    if isinstance(data, (list, tuple)):
        # Plain sequences have no .mean()/.std(); array-likes are left
        # untouched so their own reduction semantics are preserved.
        data = np.asarray(data)
    stats = {
        "nobs" + suffix: len(data),
        "mean" + suffix: data.mean(),
        "std" + suffix: data.std(),
        "min" + suffix: data.min(),
        "max" + suffix: data.max(),
    }
    return stats
|
|
|
|
|
|
def combine_stats(s1, s2):
    """
    Helper for combining the summary statistics of two sets of measurements

    The per-key values of both dictionaries are stacked into arrays and
    passed to :func:`combine_stats_arrs`, which does the actual pooling.

    Args:
        s1 (dict): stats dict containing nobs, mean, std, min, and max
        s2 (dict): stats dict containing nobs, mean, std, min, and max

    Returns:
        dict: the result of :func:`combine_stats_arrs` for the two inputs

    Example:
        >>> import pandas as pd
        >>> basis = {
        >>>     'nobs1': [1, 10, 100, 10000],
        >>>     'nobs2': [1, 10, 100, 10000],
        >>> }
        >>> for params in ub.named_product(basis):
        >>>     data1 = np.random.rand(params['nobs1'])
        >>>     data2 = np.random.rand(params['nobs2'])
        >>>     data3 = np.hstack([data1, data2])
        >>>     s1 = stats_dict(data1)
        >>>     s2 = stats_dict(data2)
        >>>     s3 = stats_dict(data3)
        >>>     # Check that our combo works
        >>>     combo_s3 = combine_stats(s1, s2)
        >>>     compare = pd.DataFrame({'raw': s3, 'combo': combo_s3})
        >>>     print(compare)
        >>>     assert np.allclose(compare.raw, compare.combo)

    References:
        .. [SO7753002] https://stackoverflow.com/questions/7753002/adding-combining-standard-deviations
        .. [SO2971315] https://math.stackexchange.com/questions/2971315/how-do-i-combine-standard-deviations-of-two-groups
    """
    stats = [s1, s2]
    stat_keys = ["nobs", "mean", "std", "min", "max"]
    # One array per statistic, holding the value from s1 then the value from s2.
    data = {key: np.array([s[key] for s in stats]) for key in stat_keys}
    return combine_stats_arrs(data)
|
|
|
|
|
|
def combine_stats_arrs(data):
    """
    Pool the summary statistics of several samples into one set of stats.

    Args:
        data (Dict[str, ArrayLike] | DataFrame):
            mapping with the keys 'nobs', 'mean', 'std', 'min' and 'max'.
            Each value holds one entry per sample. Plain lists and tuples
            are converted to numpy arrays; other inputs are used as-is.

    Returns:
        dict: maps 'nobs', 'mean', 'std', 'min' and 'max' to the total
            number of observations, the size-weighted mean, the pooled
            standard deviation, the smallest minimum and the largest
            maximum.

    Note:
        The pooled variance is
        ``sum(n_i * (std_i ** 2 + (mean_i - mean) ** 2)) / sum(n_i)``,
        which is exact when the input standard deviations were computed
        with ``ddof=0``. Other ``ddof`` values are not handled yet.

    Example:
        >>> data = {
        >>>     'nobs': [1, 1], 'mean': [0.0, 2.0], 'std': [0.0, 0.0],
        >>>     'min': [0.0, 2.0], 'max': [0.0, 2.0],
        >>> }
        >>> combo = combine_stats_arrs(data)
        >>> assert combo['mean'] == 1.0 and combo['std'] == 1.0
    """

    def _column(key):
        # Plain sequences do not support elementwise arithmetic; array-likes
        # are left untouched so their own reduction semantics are preserved.
        values = data[key]
        if isinstance(values, (list, tuple)):
            values = np.asarray(values)
        return values

    sizes = _column("nobs")
    means = _column("mean")
    stds = _column("std")
    mins = _column("min")
    maxs = _column("max")
    varis = stds * stds

    # TODO: ddof
    # https://github.com/Erotemic/misc/blob/28cf797b9b0f8bd82e3ebee2f6d0a688ecee2838/learn/stats.py#L128

    combo_size = sizes.sum()
    combo_mean = (sizes * means).sum() / combo_size

    mean_deltas = means - combo_mean

    # Within-sample spread plus the spread of the sample means around the
    # pooled mean, both weighted by sample size.
    sv = (sizes * varis).sum()
    sm = (sizes * (mean_deltas * mean_deltas)).sum()
    combo_vars = (sv + sm) / combo_size
    combo_std = np.sqrt(combo_vars)

    combo_stats = {
        "nobs": combo_size,
        "mean": combo_mean,
        "std": combo_std,
        "min": mins.min(),
        "max": maxs.max(),
    }
    return combo_stats
|