Skip to content

Instantly share code, notes, and snippets.

@NP-chaonay
Last active May 11, 2020 12:23
Show Gist options
  • Save NP-chaonay/a629de10d9e2012dee06efb5720c3c00 to your computer and use it in GitHub Desktop.
Save NP-chaonay/a629de10d9e2012dee06efb5720c3c00 to your computer and use it in GitHub Desktop.
Data Importing on Ubuntu releases' changelogs using pandas
### Program/Data Initialization ###
import urllib.request
import pandas as pd
import itertools
def pandas_series_full_display(series):
    """Print every entry of a pandas Series as ``label: value`` lines.

    The listing is framed by a 64-character dashed ruler above and below.

    Parameters
    ----------
    series : pandas.Series
        The Series to print. Labels and values are converted with ``str``.

    Returns
    -------
    None
        The output is written to standard output.
    """
    print('-'*64)
    # Iterate over (label, value) pairs directly. Using ``series.count()`` as
    # the loop bound would skip trailing entries, because ``count`` ignores
    # NaN values; integer ``series[i]`` lookups on a label index are also
    # deprecated in pandas.
    for label, value in series.items():
        print(str(label)+': '+str(value))
    print('-'*64)
# Base URL of the meta-release files; the suffixes below are appended to it.
URL_BASENAME = 'https://changelogs.ubuntu.com/meta-release'
LTS_SUFFIX = '-lts'
UNSTABLE_POSTFIX = '-development'
PROPOSED_POSTFIX = '-proposed'

# Columns that are placed first, in this order, in every loaded dataset.
KNOWN_COLS = ['Name', 'Version', 'Date', 'Supported', 'Description', 'Release-File', 'ReleaseNotes', 'ReleaseNotesHtml', 'UpgradeTool', 'UpgradeToolSignature']

# Maps a dataset identity (an entry of IDENTITIES) to its DataFrame.
DATAS = {}

# Both products enumerate (track, stage) combinations in the same order, so
# URLS[i] is the download address of the dataset named IDENTITIES[i].
URLS = [
    URL_BASENAME + ''.join(group)
    for group in itertools.product(
        ['', LTS_SUFFIX],
        ['', UNSTABLE_POSTFIX, PROPOSED_POSTFIX],
    )
]
IDENTITIES = [
    # strip('-') removes the trailing dash left by an empty stage name.
    '-'.join(group).strip('-')
    for group in itertools.product(
        ['Regular', 'LTS'],
        ['', 'Development', 'Proposed'],
    )
]
def _parse_meta_release(text):
    """Split downloaded meta-release text into one dict per release stanza.

    Stanzas are separated by blank lines; each line inside a stanza is a
    ``Key: value`` pair.

    Parameters
    ----------
    text : str
        Decoded content of one meta-release file.

    Returns
    -------
    list of dict
        One ``{key: value}`` mapping per non-empty stanza, in file order.

    Raises
    ------
    ValueError
        If a line does not contain the ``': '`` separator.
    """
    stanzas = []
    for chunk in text.split('\n\n'):
        lines = chunk.strip().splitlines()
        if not lines:
            # An empty chunk (e.g. trailing blank lines) would otherwise
            # become an all-NaN row in the DataFrame.
            continue
        # Split on the first ': ' only, so values that themselves contain
        # ': ' stay intact instead of breaking dict().
        stanzas.append(dict(line.split(': ', 1) for line in lines))
    return stanzas


# Download each dataset and store it in DATAS as a DataFrame indexed by 'Dist'.
for identity, url in zip(IDENTITIES, URLS):
    # The context manager closes the HTTP response once it has been read.
    with urllib.request.urlopen(url) as response:
        text = response.read().decode()
    dataframe = pd.DataFrame(_parse_meta_release(text)).set_index('Dist')
    # Known columns come first in a fixed order; any other columns follow,
    # sorted by name.
    extra_cols = dataframe.columns.drop(KNOWN_COLS).sort_values()
    DATAS[identity] = dataframe[KNOWN_COLS + extra_cols.tolist()]
### Check the available columns and how they're sorted in each dataset ###
# Show the column names of every dataset, in their stored order.
for identity in DATAS:
    # Print explicitly: a bare expression inside a loop is only echoed by an
    # interactive prompt and is silently discarded when run as a script.
    print(DATAS[identity].columns.tolist())
### Check which columns have NaN in any dataset ###
# One boolean Series per dataset: True for each column that contains a NaN.
BOOL_LIST = [frame.isna().any() for frame in DATAS.values()]
# Align the per-dataset results by column name and flag a column if any
# dataset reports a NaN in it. Print explicitly so the result is also
# visible when this runs as a script rather than at an interactive prompt.
print(pd.concat(BOOL_LIST, axis=1).any(axis=1))
### Find and print information for the codename "focal" in each of the datasets ###
# For every dataset, print the full "focal" row, or a warning if the dataset
# has no such codename.
for identity in DATAS:
    print('[In dataset : '+identity+']')
    if 'focal' in DATAS[identity].index:
        pandas_series_full_display(DATAS[identity].loc['focal'])
    else:
        print('[Warning] Dataset '+identity+' doesn\'t have codename \'focal\'')
    # Two blank lines after each dataset's section.
    # NOTE(review): assumed to belong inside the loop; the original
    # indentation was lost.
    print('\n'*2,end='')
### Check whether the "focal" release in the "Regular" dataset uses Development-branch ReleaseNotes ###
# Report whether the "focal" row of the "Regular" dataset has 'Devel' in its
# release-notes fields. The original wrapped this in a `while True: ... break`
# that only ever ran once, so the loop is dropped.
regular = DATAS['Regular']
if 'focal' in regular.index:
    focal = regular.loc['focal']
    for field, message in (
        ('ReleaseNotes', 'Uses Development-branch ReleaseNote'),
        ('ReleaseNotesHtml', 'Uses Development-branch ReleaseNote (HTML)'),
    ):
        value = focal[field]
        # A missing field is NaN (a float); `'Devel' in NaN` would raise
        # TypeError, so only test real strings.
        if isinstance(value, str) and 'Devel' in value:
            print(message)
else:
    # Same warning style as the per-dataset "focal" report, instead of an
    # unhandled KeyError from .loc.
    print('[Warning] Dataset Regular doesn\'t have codename \'focal\'')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment