Last active
December 10, 2023 00:12
-
-
Save caot/b22e9e870dbe01cd5c351306ec26a3c8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
* https://swimstandards.com/swimmer/
* https://www.swimcloud.com
* https://www.swimmingrank.com
* https://www.midnightsports.com/index2023.htm
'''
r'''
set CONDA_PREFIX=C:\apps\anaconda3
set CONDA_DEFAULT_ENV=base
set CONDA_EXE=%CONDA_PREFIX%\Scripts\conda.exe
set CONDA_PROMPT_MODIFIER=(base)
set CONDA_PYTHON_EXE=%CONDA_PREFIX%\python.exe
set CONDA_SHLVL=1
set Path=%CONDA_PREFIX%;%CONDA_PREFIX%\Library\mingw-w64\bin;%CONDA_PREFIX%\Library\usr\bin;%CONDA_PREFIX%\Library\bin;%CONDA_PREFIX%\Scripts;%CONDA_PREFIX%\bin;%CONDA_PREFIX%\condabin;%Path%
'''
# http://gijskoot.nl/pandas/swimming/sports/records/2018/01/16/swimming-records.html
# https://en.wikipedia.org/wiki/List_of_world_records_in_swimming
import pandas as pd | |
import matplotlib.pyplot as plt | |
# import statsmodels.api as sm | |
# import matplotlib.pylab as plt | |
# from pandas.plotting import andrews_curves | |
import re | |
# Path (or URL) of the saved results page to parse, e.g. "C:\\XXX.html".
url = "PATH TO.html"

# Read only the tables carrying class="dbtable2"; header=1 makes the second
# row of each table the column header.
tables = pd.read_html(url, header=1,
                      encoding='utf-8', attrs={"class": "dbtable2"})

# Keep the first matching table and discard rows/columns that are entirely
# empty.
df = tables[0].dropna(axis=0, how='all').dropna(axis=1, how='all')

# Remove everything up to and including the last ")" from each column label.
df.columns = df.columns.str.replace(r'.*\)', '', regex=True)

# Drop rows that have no event name. The explicit copy makes ``df`` an
# independent frame, so the column assignments performed later in this script
# do not raise SettingWithCopyWarning.
df = df[df['Event'].notna()].copy()
print(df)
# When enabled, lift the pandas display limits so printed frames show every
# row and column on unwrapped lines.
# NOTE(review): the indentation of this section was lost; all three options
# are assumed to be guarded by TO_DEBUG -- confirm.
TO_DEBUG = True

_DEBUG_DISPLAY_OPTIONS = (
    ('display.max_columns', None),
    ('display.expand_frame_repr', False),
    ('display.max_rows', None),
)

if TO_DEBUG:
    for _option_name, _option_value in _DEBUG_DISPLAY_OPTIONS:
        pd.set_option(_option_name, _option_value)
# Columns that are cast to integers; missing entries are replaced by 0 first.
to_int = {'Pos': 'int', 'Pts': 'int', 'EventAgeCurrent': 'int'}
keys = list(to_int)
df[keys] = df[keys].fillna(value=0).astype(to_int)

# Divide every 'EventAgeCurrent' value below 10000 by 10 and leave the rest
# untouched. This is done on the whole column at once: the previous per-row
# ``df[col][i] = ...`` form used the positional counter as an index label
# (the labels are no longer contiguous after the earlier row filtering) and
# relied on chained assignment, which pandas does not guarantee to write back
# into ``df``.
# NOTE(review): the reason for the 10000 threshold is not visible here --
# presumably it compensates for a decimal point lost while parsing; confirm.
_age = df['EventAgeCurrent']
df['EventAgeCurrent'] = _age.where(_age >= 10000, _age / 10)

# Remove everything up to and including the last ")" from each event name.
df['Event'] = df['Event'].str.replace(r'.*\)', '', regex=True)
# Append the preliminary time to the finals time (separated by a space) so a
# row that has only one of the two still carries a time string in 'Finals'.
# The 'Prelim.' column is optional: when the table has none, 'Finals' is left
# as it is. Both columns are converted to text after blanking missing values
# so that non-string cells cannot break the concatenation.
if 'Prelim.' in df.columns:
    _finals_text = df['Finals'].fillna('').astype(str)
    _prelim_text = df['Prelim.'].fillna('').astype(str)
    df['Finals'] = _finals_text + ' ' + _prelim_text

print('--------------------------------------------------------------------')
print(df)
print('--------------------------------------------------------------------')
# Split the first time found in each 'Finals' string into optional minutes
# (m), two-digit seconds (s) and two-digit hundredths (ms). The pattern is a
# raw string so the regex escapes are not treated as (invalid) string escapes.
parsed_Finals = df.Finals.str.extract(
    r"(?P<m>\d{1,2})?:?(?P<s>\d{2})\.(?P<ms>\d{2})*", expand=True)

# Convert to seconds; a missing minutes part counts as 0. Rows whose string
# did not match the pattern end up as NaN.
Finals_in_seconds = (
    parsed_Finals.m.astype(float).fillna(0) * 60
    + parsed_Finals.s.astype(float)
    + parsed_Finals.ms.astype(float) / 100
)
df['Finals_in_seconds'] = Finals_in_seconds
# Turn the swim dates into datetimes, then order the rows by event and,
# within each event, by date (ascending).
_swim_dates = pd.DatetimeIndex(df['Date ofSwim'])
df['Date ofSwim'] = _swim_dates
df = df.sort_values(['Event', 'Date ofSwim'])
print(df)
# df = df.reindex(sorted(df.columns), axis=1) | |
# df.sort_index(axis=1) | |
# df = df.sort_index(axis=1) | |
# https://stackoverflow.com/questions/43707620/plotting-a-time-series | |
def plot_50_Free():
    """Plot 'Finals_in_seconds' against 'Date ofSwim' for the '50 Free' rows.

    Reads the module-level ``df``; nothing is returned.
    """
    is_50_free = df['Event'] == '50 Free'
    df.loc[is_50_free].plot(x='Date ofSwim', y='Finals_in_seconds')


plot_50_Free()
def plot_df_pivot_table():
    """Plot one series per event from a date-by-event pivot of ``df``.

    The pivot has 'Date ofSwim' as its index, one column per 'Event' and
    'Finals_in_seconds' as the values.
    """
    # https://stackoverflow.com/questions/38197964/pandas-plot-multiple-time-series-dataframe-into-a-single-plot
    times_by_event = df.pivot_table(
        values='Finals_in_seconds',
        index='Date ofSwim',
        columns='Event',
    )
    times_by_event.plot()
# plot_df_pivot_table() | |
# multiple line plot | |
def plot_multiple_line(swimmer=None):
    """Draw one line per event of ``df`` on the current matplotlib figure.

    Each line plots 'Finals_in_seconds' against 'Date ofSwim' and is labelled
    with the event name. Events are ordered by the integer that starts their
    name and then by the name itself, so the legend order is the same on
    every run.

    Args:
        swimmer: Name shown in the plot title. When omitted, a module-level
            ``swimmer`` variable is used if one exists; otherwise no title
            is set (previously the missing name raised NameError).

    Raises:
        ValueError: If an event name does not start with an integer.
    """
    if swimmer is None:
        swimmer = globals().get('swimmer')

    # Valid font sizes are xx-small, x-small, small, medium, large, x-large,
    # xx-large, larger, smaller, None.
    plt.xticks(
        rotation=45,
        horizontalalignment='right',
        fontweight='light',
        fontsize='medium',
    )

    def _event_order(event):
        # Leading distance first; the name breaks ties between strokes.
        return int(event.split()[0]), event

    for event in sorted(set(df.Event), key=_event_order):
        rows = df.loc[df['Event'] == event]
        plt.plot(rows['Date ofSwim'], rows['Finals_in_seconds'],
                 marker=".", markersize=10, label=event)

    if swimmer is not None:
        plt.title('Swimmer: %s' % swimmer)
    plt.legend()


plot_multiple_line()
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
*** Swimming records cleanup and analysis
Drawing from Data