# caot/swimming records analysis.py

## swimming records analysis.py
'''
* https://swimstandards.com/swimmer/
* https://www.swimcloud.com
* https://www.swimmingrank.com
* https://www.midnightsports.com/index2023.htm
'''
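# Windows `set` commands for the conda "base" environment. Kept as a raw
# string: in a normal string literal the "\usr" in the Path line is read as a
# truncated \uXXXX escape, which is a SyntaxError.
r'''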
set CONDA_PREFIX=C:\apps\anaconda3

set CONDA_DEFAULT_ENV=base
set CONDA_EXE=%CONDA_PREFIX%\Scripts\conda.exe
set CONDA_PROMPT_MODIFIER=(base)
set CONDA_PYTHON_EXE=%CONDA_PREFIX%\python.exe
set CONDA_SHLVL=1
set Path=%CONDA_PREFIX%;%CONDA_PREFIX%\Library\mingw-w64\bin;%CONDA_PREFIX%\Library\usr\bin;%CONDA_PREFIX%\Library\bin;%CONDA_PREFIX%\Scripts;%CONDA_PREFIX%\bin;%CONDA_PREFIX%\condabin;%Path%
'''

# http://gijskoot.nl/pandas/swimming/sports/records/2018/01/16/swimming-records.html
# https://en.wikipedia.org/wiki/List_of_world_records_in_swimming

import pandas as pd
import matplotlib.pyplot as plt
# import statsmodels.api as sm
# import matplotlib.pylab as plt
# from pandas.plotting import andrews_curves

import re

# Location of the saved results page to analyse, e.g. "C:\\XXX.html".
url = "PATH TO.html"

# Read every <table class="dbtable2"> on the page, taking the row at
# index 1 as the column header.
tables = pd.read_html(
    url,
    attrs={"class": "dbtable2"},
    header=1,
    encoding='utf-8',
)

# Only the first matching table is used. Drop rows, then columns, that are
# empty throughout.
df = tables[0]
df = df.dropna(how='all', axis=0)
df = df.dropna(how='all', axis=1)

# Remove everything up to and including the last ")" in each column label.
df.columns = df.columns.str.replace(r'.*\)', '', regex=True)

# Keep only the rows that have a value in 'Event'.
df = df.loc[df['Event'].notna()]

print(df)

TO_DEBUG = True

# In debug mode, lift pandas' display limits so printed frames show every
# row and column and are not wrapped across several blocks.
if TO_DEBUG:
    for _unlimited in ('display.max_columns', 'display.max_rows'):
        pd.set_option(_unlimited, None)
    pd.set_option('display.expand_frame_repr', False)


# Columns that must hold whole numbers, mapped to their target dtype.
to_int = {'Pos': 'int', 'Pts': 'int', 'EventAgeCurrent': 'int', }
keys = list(to_int.keys())

# Missing entries count as 0 so the integer cast cannot fail on NaN.
df[keys] = df[keys].fillna(value=0).astype(to_int)

# Divide every 'EventAgeCurrent' value below 10000 by ten.
# NOTE(review): the meaning of the 10000 threshold is not visible here; the
# rule is carried over unchanged from the original loop, whose two branches
# (e < 100 and e < 10000) both assigned e / 10.
# The loop wrote through chained indexing (df[col][i] = ...) and used the
# positional counter as an index label, which hits the wrong rows (or none)
# once rows have been dropped. A whole-column assignment avoids both issues.
df['EventAgeCurrent'] = df['EventAgeCurrent'].mask(
    lambda age: age < 10000, lambda age: age / 10)

# Remove everything up to and including the last ")" in each event name.
df['Event'] = df['Event'].str.replace(r'.*\)', '', regex=True)


# Append the 'Prelim.' text to the 'Finals' text (blank where missing) so the
# time parser below sees whichever of the two is present.
# Best effort: presumably some result tables have no 'Prelim.' column
# (KeyError) or hold non-text values (TypeError) -- verify against real
# input. Anything else is a genuine bug and is allowed to propagate.
try:
    df['Finals'] = df['Finals'].fillna('') + ' ' + df['Prelim.'].fillna('')
except (KeyError, TypeError) as e:
    # Report instead of silently ignoring, then continue with 'Finals' as is.
    print('Could not merge Prelim. into Finals: %r' % (e,))

print('--------------------------------------------------------------------')
print(df)
print('--------------------------------------------------------------------')

# Split each time string (e.g. "1:05.23" or "25.31") into optional minutes
# (m), two-digit seconds (s) and two-digit hundredths (ms).
# The pattern is a raw string because "\d" and "\." are regex escapes, not
# Python string escapes; without the r prefix Python warns about them.
parsed_Finals = df.Finals.str.extract(
    r"(?P<m>\d{1,2})?:?(?P<s>\d{2})\.(?P<ms>\d{2})*", expand=True)

# Total time in seconds. A missing minutes part counts as 0; rows where the
# seconds or hundredths did not match stay NaN.
Finals_in_seconds = (
    parsed_Finals.m.astype(float).fillna(0) * 60
    + parsed_Finals.s.astype(float)
    + parsed_Finals.ms.astype(float) / 100
)

# Item assignment creates the column directly; no placeholder copy of
# 'Finals' is needed first.
df['Finals_in_seconds'] = Finals_in_seconds

# Convert the swim dates to datetimes, then order by event and date so each
# event's rows are chronological.
df['Date ofSwim'] = pd.DatetimeIndex(data=df['Date ofSwim'])
df = df.sort_values(by=['Event', 'Date ofSwim'], ascending=True,)

print(df)

# df = df.reindex(sorted(df.columns), axis=1)
# df.sort_index(axis=1)
# df = df.sort_index(axis=1)

# https://stackoverflow.com/questions/43707620/plotting-a-time-series


def plot_50_Free():
    """Plot 'Finals_in_seconds' against 'Date ofSwim' for the '50 Free' rows.

    Reads the module-level ``df``; rows are selected by an exact match of
    the 'Event' column with '50 Free'.
    """
    is_50_free = df['Event'] == '50 Free'
    df.loc[is_50_free].plot(x='Date ofSwim', y='Finals_in_seconds')


# Draw the '50 Free' plot; nothing is displayed until plt.show() is called.
plot_50_Free()


def plot_df_pivot_table():
    """Plot one line per event from a date-by-event pivot of the times.

    The module-level ``df`` is pivoted with 'Date ofSwim' as the index and
    'Event' as the columns; cells hold 'Finals_in_seconds' combined with
    pivot_table's default aggregation (the mean).

    Approach taken from
    https://stackoverflow.com/questions/38197964/pandas-plot-multiple-time-series-dataframe-into-a-single-plot
    """
    times_by_event = df.pivot_table(
        values='Finals_in_seconds',
        index='Date ofSwim',
        columns='Event',
    )
    times_by_event.plot()

# plot_df_pivot_table()


# multiple line plot
def plot_multiple_line(swimmer=None):
    """Plot each event's times over time as its own labelled line.

    Reads the module-level ``df`` and draws with ``plt`` on the current
    axes, one line per distinct value of the 'Event' column.

    Args:
        swimmer: Name shown in the plot title. When omitted, a module-level
            ``swimmer`` variable is used if one exists, otherwise 'unknown'.

    Raises:
        ValueError: If an event name does not start with an integer.
    """
    if swimmer is None:
        # The title used to read a global ``swimmer`` that this script never
        # defines, so the call failed with NameError. Fall back gracefully.
        swimmer = globals().get('swimmer', 'unknown')

    # Valid font sizes are xx-small, x-small, small, medium, large, x-large,
    # xx-large, larger, smaller, None.
    plt.xticks(
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='medium',
    )

    # Order the events by their leading integer (e.g. the 50 in '50 Free').
    for e in sorted(set(df.Event), key=lambda x: int(x.split()[0])):
        dfg = df.loc[df['Event'] == e]
        plt.plot(dfg['Date ofSwim'], dfg['Finals_in_seconds'],
                 marker=".", markersize=10, label=e)

    # Title and legend only need to be set once, after all lines exist.
    plt.title('Swimmer: %s' % swimmer)
    plt.legend()


# Draw the per-event lines, then display everything plotted so far.
plot_multiple_line()

plt.show()
	'''
	* https://swimstandards.com/swimmer/
	* https://www.swimcloud.com
	* https://www.swimmingrank.com
	* https://www.midnightsports.com/index2023.htm
	'''
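	# Windows `set` commands for the conda "base" environment. Kept as a raw
	# string: in a normal string literal the "\usr" in the Path line is read as a
	# truncated \uXXXX escape, which is a SyntaxError.
	r'''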
	set CONDA_PREFIX=C:\apps\anaconda3

	set CONDA_DEFAULT_ENV=base
	set CONDA_EXE=%CONDA_PREFIX%\Scripts\conda.exe
	set CONDA_PROMPT_MODIFIER=(base)
	set CONDA_PYTHON_EXE=%CONDA_PREFIX%\python.exe
	set CONDA_SHLVL=1
	set Path=%CONDA_PREFIX%;%CONDA_PREFIX%\Library\mingw-w64\bin;%CONDA_PREFIX%\Library\usr\bin;%CONDA_PREFIX%\Library\bin;%CONDA_PREFIX%\Scripts;%CONDA_PREFIX%\bin;%CONDA_PREFIX%\condabin;%Path%
	'''

	# http://gijskoot.nl/pandas/swimming/sports/records/2018/01/16/swimming-records.html
	# https://en.wikipedia.org/wiki/List_of_world_records_in_swimming

	import pandas as pd
	import matplotlib.pyplot as plt
	# import statsmodels.api as sm
	# import matplotlib.pylab as plt
	# from pandas.plotting import andrews_curves

	import re

	# Location of the saved results page to analyse, e.g. "C:\\XXX.html".
	url = "PATH TO.html"

	# Read every <table class="dbtable2"> on the page, taking the row at
	# index 1 as the column header.
	tables = pd.read_html(
	    url,
	    attrs={"class": "dbtable2"},
	    header=1,
	    encoding='utf-8',
	)

	# Only the first matching table is used. Drop rows, then columns, that are
	# empty throughout.
	df = tables[0]
	df = df.dropna(how='all', axis=0)
	df = df.dropna(how='all', axis=1)

	# Remove everything up to and including the last ")" in each column label.
	df.columns = df.columns.str.replace(r'.*\)', '', regex=True)

	# Keep only the rows that have a value in 'Event'.
	df = df.loc[df['Event'].notna()]

	print(df)

	TO_DEBUG = True

	# In debug mode, lift pandas' display limits so printed frames show every
	# row and column and are not wrapped across several blocks.
	# The body of this ``if`` had lost its indentation and did not parse.
	if TO_DEBUG:
	    pd.set_option('display.max_columns', None)
	    pd.set_option('display.expand_frame_repr', False)
	    pd.set_option('display.max_rows', None)


	# Columns that must hold whole numbers, mapped to their target dtype.
	to_int = {'Pos': 'int', 'Pts': 'int', 'EventAgeCurrent': 'int', }
	keys = list(to_int.keys())

	# Missing entries count as 0 so the integer cast cannot fail on NaN.
	df[keys] = df[keys].fillna(value=0).astype(to_int)

	# Divide every 'EventAgeCurrent' value below 10000 by ten.
	# NOTE(review): the meaning of the 10000 threshold is not visible here; the
	# rule is carried over unchanged from the original loop, whose two branches
	# (e < 100 and e < 10000) both assigned e / 10.
	# That loop had lost its indentation (it did not parse), wrote through
	# chained indexing (df[col][i] = ...) and used the positional counter as an
	# index label. A whole-column assignment avoids all three problems.
	df['EventAgeCurrent'] = df['EventAgeCurrent'].mask(
	    lambda age: age < 10000, lambda age: age / 10)

	# Remove everything up to and including the last ")" in each event name.
	df['Event'] = df['Event'].str.replace(r'.*\)', '', regex=True)


	# Append the 'Prelim.' text to the 'Finals' text (blank where missing) so the
	# time parser below sees whichever of the two is present.
	# Best effort: presumably some result tables have no 'Prelim.' column
	# (KeyError) or hold non-text values (TypeError) -- verify against real
	# input. Anything else is a genuine bug and is allowed to propagate.
	# The try/except bodies had lost their indentation and did not parse.
	try:
	    df['Finals'] = df['Finals'].fillna('') + ' ' + df['Prelim.'].fillna('')
	except (KeyError, TypeError) as e:
	    # Report instead of silently ignoring, then continue with 'Finals' as is.
	    print('Could not merge Prelim. into Finals: %r' % (e,))

	print('--------------------------------------------------------------------')
	print(df)
	print('--------------------------------------------------------------------')

	# Split each time string (e.g. "1:05.23" or "25.31") into optional minutes
	# (m), two-digit seconds (s) and two-digit hundredths (ms).
	# The pattern is a raw string because "\d" and "\." are regex escapes, not
	# Python string escapes; without the r prefix Python warns about them.
	parsed_Finals = df.Finals.str.extract(
	    r"(?P<m>\d{1,2})?:?(?P<s>\d{2})\.(?P<ms>\d{2})*", expand=True)

	# Total time in seconds. A missing minutes part counts as 0; rows where the
	# seconds or hundredths did not match stay NaN.
	Finals_in_seconds = (
	    parsed_Finals.m.astype(float).fillna(0) * 60
	    + parsed_Finals.s.astype(float)
	    + parsed_Finals.ms.astype(float) / 100
	)

	# Item assignment creates the column directly; no placeholder copy of
	# 'Finals' is needed first.
	df['Finals_in_seconds'] = Finals_in_seconds

	# Convert the swim dates to datetimes, then order by event and date so each
	# event's rows are chronological.
	df['Date ofSwim'] = pd.DatetimeIndex(data=df['Date ofSwim'])
	df = df.sort_values(by=['Event', 'Date ofSwim'], ascending=True,)

	print(df)

	# df = df.reindex(sorted(df.columns), axis=1)
	# df.sort_index(axis=1)
	# df = df.sort_index(axis=1)

	# https://stackoverflow.com/questions/43707620/plotting-a-time-series


	def plot_50_Free():
	    """Plot 'Finals_in_seconds' against 'Date ofSwim' for the '50 Free' rows.

	    Reads the enclosing ``df``; rows are selected by an exact match of the
	    'Event' column with '50 Free'. The body had lost its indentation and
	    did not parse; it is restored here.
	    """
	    dfg = df.loc[df['Event'] == '50 Free']
	    dfg.plot(x='Date ofSwim', y='Finals_in_seconds')


	# Draw the '50 Free' plot; nothing is displayed until plt.show() is called.
	plot_50_Free()


	def plot_df_pivot_table():
	    """Plot one line per event from a date-by-event pivot of the times.

	    The enclosing ``df`` is pivoted with 'Date ofSwim' as the index and
	    'Event' as the columns; cells hold 'Finals_in_seconds' combined with
	    pivot_table's default aggregation (the mean). The body had lost its
	    indentation and did not parse; it is restored here.

	    Approach taken from
	    https://stackoverflow.com/questions/38197964/pandas-plot-multiple-time-series-dataframe-into-a-single-plot
	    """
	    df.pivot_table(index='Date ofSwim', columns='Event',
	                   values='Finals_in_seconds').plot()

	# plot_df_pivot_table()


	# multiple line plot
	def plot_multiple_line(swimmer=None):
	    """Plot each event's times over time as its own labelled line.

	    Reads the enclosing ``df`` and draws with ``plt`` on the current axes,
	    one line per distinct value of the 'Event' column. The body had lost
	    its indentation and did not parse; it is restored here.

	    Args:
	        swimmer: Name shown in the plot title. When omitted, a module-level
	            ``swimmer`` variable is used if one exists, otherwise 'unknown'.

	    Raises:
	        ValueError: If an event name does not start with an integer.
	    """
	    if swimmer is None:
	        # The title used to read a global ``swimmer`` that this script never
	        # defines, so the call failed with NameError. Fall back gracefully.
	        swimmer = globals().get('swimmer', 'unknown')

	    # Valid font sizes are xx-small, x-small, small, medium, large, x-large,
	    # xx-large, larger, smaller, None.
	    plt.xticks(
	        rotation=45,
	        horizontalalignment='right',
	        fontweight='light',
	        fontsize='medium',
	    )

	    # Order the events by their leading integer (e.g. the 50 in '50 Free').
	    for e in sorted(set(df.Event), key=lambda x: int(x.split()[0])):
	        dfg = df.loc[df['Event'] == e]
	        plt.plot(dfg['Date ofSwim'], dfg['Finals_in_seconds'],
	                 marker=".", markersize=10, label=e)

	    # Title and legend only need to be set once, after all lines exist.
	    plt.title('Swimmer: %s' % swimmer)
	    plt.legend()


	# Draw the per-event lines, then display everything plotted so far.
	plot_multiple_line()

	plt.show()