Scraping code to get number of fans for hardstyle artists
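Judging from the imports, the script assumes the requests, pandas, beautifulsoup4 and lxml packages are installed (lxml is the parser handed to BeautifulSoup).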
import os
import re
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
# Modify this to be your desired target directory.
os.chdir('/Users/Username/Downloads/')

# Set headers to make the website think we're a browser.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Start from this URL, which lists hardstyle artists.
baseurl = 'https://partyflock.nl/artist/searchresult?NAME=&GENDER=&COUNTRYID=&GID%5B%5D=41&GID%5B%5D=48&GID%5B%5D=52&GID%5B%5D=55&GID%5B%5D=5&GID%5B%5D=9&GID%5B%5D=64'

# Retrieve the base URL and parse it.
page = requests.get(baseurl, headers = headers)
page = bs(page.text, 'lxml')

# Pull out the table that contains all the artists.
articles = page.find('table', attrs = {'class': 'regular'})

# Find all links that also carry an id attribute (these appear to be the artist links).
artistlinks = [x.attrs['href'] for x in articles.find_all('a') if 'id' in x.attrs]

# Prepend the base domain so that the links work.
artistlinks = ['https://partyflock.nl/' + x for x in artistlinks]

# Pull out the artist IDs.
artistids = [x.attrs['id'] for x in articles.find_all('a') if 'id' in x.attrs]

# Pull out the artist names from the link text.
artistnames = [x.text for x in articles.find_all('a') if 'id' in x.attrs]

# Collect all of this information in a dataframe.
artistdata = pd.DataFrame({'mylinks': artistlinks, 'ids': artistids, 'names': artistnames})

# Create the empty fans column.
artistdata['fans'] = None

# For each row in the artist data, scan the artist's page for the number of fans
# and, if found, put it into the table.
for z in range(len(artistdata)):
    print('Scanning artist number ' + str(z + 1) + ' out of ' + str(len(artistdata)))
    # Add error handling in case the page goes down, and skip to the next artist on failure.
    try:
        page = requests.get(artistdata.loc[z, 'mylinks'], headers = headers)
    except requests.exceptions.RequestException:
        print('Failed to scan page ' + str(z + 1) + '!')
        continue
    # Parse the html.
    page = bs(page.text, 'lxml')
    # Guard against pages with no fan data; those rows stay None in the output.
    fanlink = page.find('a', href = re.compile('fans'), string = re.compile('^[0-9]+$'))
    if fanlink is not None:
        artistdata.loc[z, 'fans'] = fanlink.text
    # Save the data every 100 pages in case of a crash.
    if z % 100 == 0:
        artistdata.to_csv('Artists with number of fans.csv', index = False)
    # Pause the scraper for 2s to be polite to the website.
    time.sleep(2)
# Write the data to csv (it will appear in the working directory you set at the top of the script).
artistdata.to_csv('Artists with number of fans.csv', index = False)
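Once the scrape finishes, the CSV can be pulled back in for a quick sanity check. This is just a sketch of a possible follow-up, not part of the original script; the results name and the to_numeric/sort step are illustrative additions.
import pandas as pd

# Hypothetical follow-up: reload the saved CSV and coerce the scraped fan
# counts (stored as text) to numbers; artists with no fan data become NaN.
results = pd.read_csv('Artists with number of fans.csv')
results['fans'] = pd.to_numeric(results['fans'], errors = 'coerce')

# Show the ten artists with the most fans.
print(results.sort_values('fans', ascending = False).head(10))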