Skip to content

Instantly share code, notes, and snippets.

@Krazete
Last active October 23, 2020 06:09
Show Gist options
  • Save Krazete/a6ca6ad52c644fc1444320471a4bc0e6 to your computer and use it in GitHub Desktop.
Save Krazete/a6ca6ad52c644fc1444320471a4bc0e6 to your computer and use it in GitHub Desktop.
Webscrape the VTuber Rankings List
import bs4 as bs
import urllib.request
import json
# Scrape results keyed by channel ID. processUserPage fills each entry with
# 'nm' (channel name text), 'yt' (the 'fav_cnt' -> 'a_month' series) and
# 'tw' (the 'linked_cnt' -> 'a_month' series); saveProfile reads them back.
vrank = {}
def processRankingPage(i, redos=1):  # 1 page ~ 1.5 minutes
    """Scrape one ranking page and process every channel row listed on it.

    Args:
        i: Zero-based page index; the site is queried with ``page=i + 1``.
        redos: Number of extra download attempts after a failed one.
            Negative values are treated as zero.

    Nothing is returned. If the page cannot be downloaded and parsed after
    all attempts, the URL and the last error are printed and no rows are
    processed.
    """
    url = 'https://virtual-youtuber.userlocal.jp/document/ranking?page={}'.format(i + 1)

    # Only the download/parse step is retried. Row handling happens after
    # the loop so that a single odd row can never cause the whole page
    # (and every user page behind it) to be fetched a second time.
    last_error = None
    for _ in range(max(redos, 0) + 1):
        try:
            # The context manager closes the HTTP response even on errors;
            # the timeout keeps a stalled connection from hanging the run.
            with urllib.request.urlopen(url, timeout=30) as response:
                html = response.read()
            soup = bs.BeautifulSoup(html, 'lxml')
        except Exception as e:  # best-effort scraper: report and move on
            last_error = e
        else:
            break
    else:
        print(url)
        print(last_error)
        return

    for tr in soup.find_all('tr', {'data-href': True}):
        # The channel ID is the last path segment of the row's link target.
        channelid = tr.get('data-href').split('/')[-1]
        nm = tr.find('span', {'class': 'text-secondary'})
        if nm is None:
            # Fall back to the link inside the name cell. Either lookup may
            # come back empty, in which case nm stays None and is passed
            # through unchanged.
            cell = tr.find('td', {'class': 'col-name'})
            if cell is not None:
                nm = cell.find('a', {'class': 'no-propagation'})
        processUserPage(channelid, nm)
def processUserPage(channelid, nm, redos=1):
    """Fetch one channel's page and record its name and stats in ``vrank``.

    Args:
        channelid: Channel identifier appended to the site's ``/user/`` URL.
        nm: Parsed element whose ``.text`` holds the channel name, or None
            when no name element was found (an empty name is stored then).
        redos: Number of extra attempts after a failed one. Negative values
            are treated as zero.

    Nothing is returned. On success the entry ``vrank[channelid]`` gains the
    keys 'nm', 'yt' and 'tw'; keys that already exist are left as they are.
    If every attempt fails, the URL and the last error are printed and
    ``vrank`` is not modified.
    """
    url = 'https://virtual-youtuber.userlocal.jp/user/{}'.format(channelid)
    # Resolve the name up front: a missing name element is not something a
    # second download could fix, so it must not trigger a retry.
    name = nm.text.strip() if nm is not None else ''

    last_error = None
    for _ in range(max(redos, 0) + 1):
        try:
            # The context manager closes the HTTP response even on errors;
            # the timeout keeps a stalled connection from hanging the run.
            with urllib.request.urlopen(url, timeout=30) as response:
                html = response.read()
            soup = bs.BeautifulSoup(html, 'lxml')
            # The page carries its statistics as JSON text in <span id="stats">.
            span = soup.find('span', {'id': 'stats'})
            data = json.loads(span.text)
            yt = data.get('fav_cnt', {}).get('a_month', [])
            tw = data.get('linked_cnt', {}).get('a_month', [])
        except Exception as e:  # best-effort scraper: report and move on
            last_error = e
        else:
            break
    else:
        print(url)
        print(last_error)
        return

    # Touch vrank only once every value is known, so a failed scrape can
    # never leave a half-filled entry behind.
    entry = vrank.setdefault(channelid, {})
    entry.setdefault('nm', name)
    entry.setdefault('yt', yt)
    entry.setdefault('tw', tw)
def saveProfile(profile, records=None):
    """Write one stat series of every channel to ``'<profile>.csv'``.

    The file has one row per channel and one column per date found in any
    channel's series, sorted ascending. A channel without a datapoint for
    a date gets an empty cell, so values always sit under their own date.

    Args:
        profile: Key of the series inside each channel entry (the script
            uses 'tw' and 'yt'); also used as the output file's base name.
        records: Mapping of channel ID to entry dict. Defaults to the
            module-level ``vrank``.

    Entries lacking ``profile`` or 'nm' are written with empty cells
    instead of raising KeyError.
    """
    import csv  # function-scope import keeps this block self-contained

    if records is None:
        records = vrank

    # Index every channel's datapoints by date once, instead of rescanning
    # the series for each cell. The first datapoint for a date wins.
    values_by_channel = {}
    dates = set()
    for channelid, entry in records.items():
        by_date = {}
        for datepoint in entry.get(profile, []):
            by_date.setdefault(datepoint[0], datepoint[1])
        values_by_channel[channelid] = by_date
        dates.update(by_date)
    sdates = sorted(dates)

    # newline='' lets the csv module control line endings; UTF-8 keeps
    # non-ASCII channel names writable regardless of the platform default.
    with open('{}.csv'.format(profile), 'w', encoding='utf-8', newline='') as fp:
        # csv.writer quotes names that contain commas, quotes or newlines.
        writer = csv.writer(fp, lineterminator='\n')
        writer.writerow(['Channel ID', 'Channel Name'] + sdates)
        for channelid, entry in records.items():
            by_date = values_by_channel[channelid]
            cells = [by_date.get(date, '') for date in sdates]
            writer.writerow([channelid, entry.get('nm', '')] + cells)
if __name__ == '__main__':
    # Scrape ranking pages 1 through 5 (indices 0-4), which fills vrank,
    # then dump each recorded series to its own CSV file.
    for page_index in range(5):
        processRankingPage(page_index)
    for series in ('tw', 'yt'):
        saveProfile(series)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment