# Barry1/datapredictionqualityplot.py

## datapredictionqualityplot.py
import matplotlib.pyplot
from numpy import sqrt
import pandas
from scipy.special import erfinv
from typing import Union


def dataqualityplot(
        data: pandas.DataFrame,
        x: str,
        y: str,
        modell: Union[None, str] = None,
        res: int = 50,
        ci: float = 0.95,
        scatter: bool = False,
        globalbands: bool = False,
        oldfill: bool = False,
        **pltopts) -> None:
    """Plot grouped observations, a model prediction and an error band.

    The rows of ``data`` are sorted by ``x`` and split by row position into
    at most ``res`` consecutive groups.  The group means of ``y`` and of the
    model column are drawn as two lines on a new figure.  Around the model
    a shaded band is drawn that is centred on ``model + mean(error)`` and
    reaches ``sqrt(2) * erfinv(ci)`` standard deviations (``ddof=0``) of
    ``error = y - model`` to either side.

    Args:
        data: Table containing the columns ``x`` and ``y`` (and ``modell``
            when given).  Only a sorted copy is worked on.
        x: Name of the column used for the horizontal axis.
        y: Name of the column holding the observed values.
        modell: Name of the column holding the predictions.  When falsy, a
            straight line ``a * x + b`` is derived from the covariance of
            ``x`` and ``y`` and used under the label 'Lineare Regression'.
        res: Upper bound for the number of groups along ``x``.
        ci: Value passed through ``sqrt(2) * erfinv(ci)`` to obtain the band
            half-width in standard deviations (the two-sided quantile of a
            normal distribution); also shown in the title.
        scatter: When true, the individual observations are drawn as dots.
        globalbands: When true, mean and standard deviation of the error are
            taken over all rows; otherwise they are taken per group.
        oldfill: When true, the band is drawn as a ``step='mid'`` fill at the
            group means; otherwise each group spans from its first to its
            last ``x`` value using the model value at those two rows.
        **pltopts: Forwarded unchanged to ``matplotlib.pyplot.figure``.
    """
    # Band half-width expressed as a multiple of the error's std deviation.
    sigmafactor = erfinv(ci) * sqrt(2)
    if modell:
        wrkdata = data[[x, y, modell]].sort_values(
            x).reset_index(drop=True)
    else:
        # Slope and intercept of the least-squares line from the covariance.
        covges = data[[x, y]].cov()
        a = covges.loc[y, x] / covges.loc[x, x]
        b = data[y].mean() - a * data[x].mean()
        modell = 'Lineare Regression'
        wrkdata = data[[x, y]].sort_values(x).reset_index(drop=True)
        wrkdata.loc[:, modell] = a * wrkdata[x] + b
    # ``Series.is_monotonic`` no longer exists in pandas 2.x; the explicit
    # ``is_monotonic_increasing`` is available in old and new versions.
    assert wrkdata[x].is_monotonic_increasing
    wrkdata.loc[:, 'Error'] = wrkdata[y] - wrkdata[modell]
    # Integer group id per row: positions 0..n-1 mapped onto 0..res-1.
    grpcol = res * wrkdata.index.to_series() // len(wrkdata)
    grpobj = wrkdata.groupby(by=grpcol)
    grpdata = grpobj.mean()

    matplotlib.pyplot.figure(**pltopts)
    ax = matplotlib.pyplot.gca()
    ax.plot(grpdata[x], grpdata[y], label=y + ' (gruppiert)')
    (modelline,) = ax.plot(grpdata[x], grpdata[modell],
                           label=modell + ' (gruppiert)')
    # The band reuses the colour of the model line.
    savecolor = modelline.get_color()

    fehler: Union[float, pandas.Series]
    deltarange: Union[float, pandas.Series]
    if globalbands:
        fehler = wrkdata['Error'].mean()
        deltarange = wrkdata['Error'].std(ddof=0)
    else:
        fehler = grpdata['Error']
        deltarange = grpobj['Error'].std(ddof=0)
    halfwidth = sigmafactor * deltarange

    if oldfill:
        centre = grpdata[modell] + fehler
        ax.fill_between(grpdata[x],
                        centre - halfwidth,
                        centre + halfwidth,
                        facecolor=savecolor,
                        step='mid',
                        alpha=.15)
    else:
        # One (first, last) pair per group; flattening row-wise yields the
        # sequence first_0, last_0, first_1, last_1, ... for x and the band.
        xall = grpobj[x].agg(['first', 'last'])
        centre = grpobj[modell].agg(['first', 'last']).add(fehler, axis=0)
        lower = centre.sub(halfwidth, axis=0)
        upper = centre.add(halfwidth, axis=0)
        # No additional constant offset: the band width is determined by
        # ``ci`` alone, exactly as in the ``oldfill`` branch.
        ax.fill_between(xall.values.flatten(),
                        lower.values.flatten(),
                        upper.values.flatten(),
                        facecolor=savecolor,
                        alpha=.15)
    if scatter:
        ax.plot(wrkdata[x], wrkdata[y], '.', label=y)
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    ax.legend()
    ax.set_title(f'Daten, Modell und {100*ci} % Bestimmtheit')


if __name__ == "__main__":
    # Self-test / usage example: draws two seaborn regression plots of the
    # "tips" dataset and then the plot produced by dataqualityplot.
    import numpy
    import seaborn

    print("Selbsttest bzw. Anwendungsbeispiel")
    tips = seaborn.load_dataset("tips")
    columns = {"x": "total_bill", "y": "tip"}
    seaborn.regplot(data=tips, scatter=False, ci=60, **columns)
    bins = numpy.linspace(0, 50, 10)
    seaborn.regplot(data=tips, x_bins=bins, x_ci=60, ci=60, **columns)
    dataqualityplot(data=tips, res=10, globalbands=True,
                    figsize=(10, 5), **columns)
    # Reference notes (presumably for a HoloViews variant of the band;
    # nothing below is executed):
    #   https://holoviews.org/reference/elements/bokeh/Spread.html
    #     hv.extension('bokeh')
    #     hv.help(hv.Spread)
    #     xs = np.linspace(0, np.pi*2, 20)
    #     spread = hv.Spread((xs, np.sin(xs), 0.1+np.random.rand(len(xs)),
    #                         0.1+np.random.rand(len(xs))),
    #                        vdims=['y', 'yerrneg', 'yerrpos'])
    #     spread.opts(fill_alpha=1, fill_color='indianred')
    #   http://holoviews.org/getting_started/Tabular_Datasets.html (ErrorBars)
    #     note: "symmetric vs." (unfinished); class Spread(ErrorBars)
    #   https://holoviews.org/reference/elements/matplotlib/Area.html
	import matplotlib.pyplot
	from numpy import sqrt
	import pandas
	from scipy.special import erfinv
	from typing import Union


	def dataqualityplot(
	        data: pandas.DataFrame,
	        x: str,
	        y: str,
	        modell: Union[None, str] = None,
	        res: int = 50,
	        ci: float = 0.95,
	        scatter: bool = False,
	        globalbands: bool = False,
	        oldfill: bool = False,
	        **pltopts) -> None:
	    """Plot grouped observations, a model prediction and an error band.

	    The rows of ``data`` are sorted by ``x`` and split by row position into
	    at most ``res`` consecutive groups.  The group means of ``y`` and of
	    the model column are drawn as two lines on a new figure.  Around the
	    model a shaded band is drawn that is centred on ``model + mean(error)``
	    and reaches ``sqrt(2) * erfinv(ci)`` standard deviations (``ddof=0``)
	    of ``error = y - model`` to either side.

	    Args:
	        data: Table containing the columns ``x`` and ``y`` (and ``modell``
	            when given).  Only a sorted copy is worked on.
	        x: Name of the column used for the horizontal axis.
	        y: Name of the column holding the observed values.
	        modell: Name of the column holding the predictions.  When falsy, a
	            straight line ``a * x + b`` is derived from the covariance of
	            ``x`` and ``y`` and used under the label 'Lineare Regression'.
	        res: Upper bound for the number of groups along ``x``.
	        ci: Value passed through ``sqrt(2) * erfinv(ci)`` to obtain the
	            band half-width in standard deviations (the two-sided quantile
	            of a normal distribution); also shown in the title.
	        scatter: When true, the individual observations are drawn as dots.
	        globalbands: When true, mean and standard deviation of the error
	            are taken over all rows; otherwise they are taken per group.
	        oldfill: When true, the band is drawn as a ``step='mid'`` fill at
	            the group means; otherwise each group spans from its first to
	            its last ``x`` value using the model value at those two rows.
	        **pltopts: Forwarded unchanged to ``matplotlib.pyplot.figure``.
	    """
	    # Band half-width expressed as a multiple of the error's std deviation.
	    sigmafactor = erfinv(ci) * sqrt(2)
	    if modell:
	        wrkdata = data[[x, y, modell]].sort_values(
	            x).reset_index(drop=True)
	    else:
	        # Slope and intercept of the least-squares line from the covariance.
	        covges = data[[x, y]].cov()
	        a = covges.loc[y, x] / covges.loc[x, x]
	        b = data[y].mean() - a * data[x].mean()
	        modell = 'Lineare Regression'
	        wrkdata = data[[x, y]].sort_values(x).reset_index(drop=True)
	        wrkdata.loc[:, modell] = a * wrkdata[x] + b
	    # ``Series.is_monotonic`` no longer exists in pandas 2.x; the explicit
	    # ``is_monotonic_increasing`` is available in old and new versions.
	    assert wrkdata[x].is_monotonic_increasing
	    wrkdata.loc[:, 'Error'] = wrkdata[y] - wrkdata[modell]
	    # Integer group id per row: positions 0..n-1 mapped onto 0..res-1.
	    grpcol = res * wrkdata.index.to_series() // len(wrkdata)
	    grpobj = wrkdata.groupby(by=grpcol)
	    grpdata = grpobj.mean()

	    matplotlib.pyplot.figure(**pltopts)
	    ax = matplotlib.pyplot.gca()
	    ax.plot(grpdata[x], grpdata[y], label=y + ' (gruppiert)')
	    (modelline,) = ax.plot(grpdata[x], grpdata[modell],
	                           label=modell + ' (gruppiert)')
	    # The band reuses the colour of the model line.
	    savecolor = modelline.get_color()

	    fehler: Union[float, pandas.Series]
	    deltarange: Union[float, pandas.Series]
	    if globalbands:
	        fehler = wrkdata['Error'].mean()
	        deltarange = wrkdata['Error'].std(ddof=0)
	    else:
	        fehler = grpdata['Error']
	        deltarange = grpobj['Error'].std(ddof=0)
	    halfwidth = sigmafactor * deltarange

	    if oldfill:
	        centre = grpdata[modell] + fehler
	        ax.fill_between(grpdata[x],
	                        centre - halfwidth,
	                        centre + halfwidth,
	                        facecolor=savecolor,
	                        step='mid',
	                        alpha=.15)
	    else:
	        # One (first, last) pair per group; flattening row-wise yields the
	        # sequence first_0, last_0, first_1, last_1, ... for x and the band.
	        xall = grpobj[x].agg(['first', 'last'])
	        centre = grpobj[modell].agg(['first', 'last']).add(fehler, axis=0)
	        lower = centre.sub(halfwidth, axis=0)
	        upper = centre.add(halfwidth, axis=0)
	        # No additional constant offset: the band width is determined by
	        # ``ci`` alone, exactly as in the ``oldfill`` branch.
	        ax.fill_between(xall.values.flatten(),
	                        lower.values.flatten(),
	                        upper.values.flatten(),
	                        facecolor=savecolor,
	                        alpha=.15)
	    if scatter:
	        ax.plot(wrkdata[x], wrkdata[y], '.', label=y)
	    ax.set_xlabel(x)
	    ax.set_ylabel(y)
	    ax.legend()
	    ax.set_title(f'Daten, Modell und {100*ci} % Bestimmtheit')


	if __name__ == "__main__":
	    # Self-test / usage example: draws two seaborn regression plots of the
	    # "tips" dataset and then the plot produced by dataqualityplot.
	    import seaborn
	    print("Selbsttest bzw. Anwendungsbeispiel")
	    tips = seaborn.load_dataset("tips")
	    seaborn.regplot(x="total_bill", y="tip", data=tips,
	                    scatter=False, ci=60)
	    import numpy
	    seaborn.regplot(x="total_bill", y="tip", data=tips,
	                    x_bins=numpy.linspace(0, 50, 10), x_ci=60, ci=60)
	    dataqualityplot(data=tips, x="total_bill", y="tip",
	                    res=10, globalbands=True, figsize=(10, 5))
	    # Reference notes (presumably for a HoloViews variant of the band;
	    # nothing below is executed):
	    #   https://holoviews.org/reference/elements/bokeh/Spread.html
	    #     hv.extension('bokeh')
	    #     hv.help(hv.Spread)
	    #     xs = np.linspace(0, np.pi*2, 20)
	    #     spread = hv.Spread((xs, np.sin(xs), 0.1+np.random.rand(len(xs)),
	    #                         0.1+np.random.rand(len(xs))),
	    #                        vdims=['y', 'yerrneg', 'yerrpos'])
	    #     spread.opts(fill_alpha=1, fill_color='indianred')
	    #   http://holoviews.org/getting_started/Tabular_Datasets.html (ErrorBars)
	    #     note: "symmetric vs." (unfinished); class Spread(ErrorBars)
	    #   https://holoviews.org/reference/elements/matplotlib/Area.html