# Scraping code to get the number of fans for hardstyle artists
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Modify this to be your desired target directory.
os.chdir('/Users/Username/Downloads/')
# Set headers to make the website think we're a browser.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
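# (Without a custom User-Agent, requests identifies itself as 'python-requests',
# which some sites block or serve stripped-down pages to.)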
# Start from this URL that lists hardstyle artists
baseurl = 'https://partyflock.nl/artist/searchresult?NAME=&GENDER=&COUNTRYID=&GID%5B%5D=41&GID%5B%5D=48&GID%5B%5D=52&GID%5B%5D=55&GID%5B%5D=5&GID%5B%5D=9&GID%5B%5D=64'
# Retrieve the base url and parse it
page = requests.get(baseurl, headers = headers)
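# Optional hardening (an addition, not in the original gist): fail fast if the
# listing page didn't return a successful status code.
page.raise_for_status()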
page = bs(page.text, 'lxml')
# Pull out the table that contains all the artists.
articles = page.find('table', attrs = {'class': 'regular'})
# Find all links that also contain an ID (these seem to be the artist links).
artistlinks = [x.attrs['href'] for x in articles.find_all('a') if 'id' in x.attrs]
# Add the base URL so that the links work.
artistlinks = ['https://partyflock.nl/' + x for x in artistlinks]
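# Note (an addition, not in the original gist): if the scraped hrefs ever start
# with a slash, plain concatenation produces a double slash. urljoin is a safer
# way to build absolute URLs, e.g.:
#   from urllib.parse import urljoin
#   artistlinks = [urljoin('https://partyflock.nl/', x.attrs['href'])
#                  for x in articles.find_all('a') if 'id' in x.attrs]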
# Pull out the artist IDs
artistids = [x.attrs['id'] for x in articles.find_all('a') if 'id' in x.attrs]
# Pull out the artist names from the text field
artistnames = [x.text for x in articles.find_all('a') if 'id' in x.attrs]
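# Sanity check (an addition, not in the original gist): the three lists are
# built from identical find_all filters, so their lengths should always match.
assert len(artistlinks) == len(artistids) == len(artistnames)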
# Create a dataframe with all this information.
artistdata = pd.DataFrame({'mylinks': artistlinks, 'ids': artistids, 'names': artistnames})
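# artistdata now has one row per artist, with columns 'mylinks', 'ids' and 'names'.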
# Create the empty fans column
artistdata['fans'] = None
# For each row in the artist data, scan for the number of fans and, if found, put it into the table.
for z in range(len(artistdata)):
    print('Scanning artist number ' + str(z + 1) + ' out of ' + str(len(artistdata)))
    # Add error handling in case the page goes down.
    try:
        page = requests.get(artistdata.loc[z, 'mylinks'], headers = headers)
    except requests.exceptions.RequestException:
        print('Failed to scan page ' + str(z + 1) + '!')
        continue
    # Parse the html
    page = bs(page.text, 'lxml')
    # Guard against pages with no fan data. These rows will stay "None" in the output.
    fanlink = page.find('a', href = re.compile('fans'), string = re.compile('^[0-9]+$'))
    if fanlink is not None:
        artistdata.loc[z, 'fans'] = fanlink.text
    # Save the data every 100 pages in case of a crash.
    if z % 100 == 0:
        artistdata.to_csv('Artists with number of fans.csv', index = False)
    # Pause the scraper for 2s to be polite to the website.
    time.sleep(2)
# Write the final data to CSV (it will appear in the working directory set at the top of the script).
artistdata.to_csv('Artists with number of fans.csv', index = False)
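# Optional post-processing (a sketch, not part of the original gist): the
# scraped fan counts are strings, so convert them to numbers before analysis.
# errors = 'coerce' turns the None/unparseable entries into NaN.
artistdata['fans'] = pd.to_numeric(artistdata['fans'], errors = 'coerce')
print(artistdata['fans'].describe())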