Scraping code to get number of fans for hardstyle artists
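Judging from the imports, the script assumes the requests, pandas, beautifulsoup4 and lxml packages are installed (lxml is the parser handed to BeautifulSoup).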
import os
import re
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
# Modify this to be your desired target directory.
os.chdir('/Users/Username/Downloads/')

# Set headers to make the website think we're a browser.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

# Start from this URL, which lists hardstyle artists.
baseurl = 'https://partyflock.nl/artist/searchresult?NAME=&GENDER=&COUNTRYID=&GID%5B%5D=41&GID%5B%5D=48&GID%5B%5D=52&GID%5B%5D=55&GID%5B%5D=5&GID%5B%5D=9&GID%5B%5D=64'

# Retrieve the base URL and parse it.
page = requests.get(baseurl, headers = headers)
page = bs(page.text, 'lxml')

# Pull out the table that contains all the artists.
articles = page.find('table', attrs = {'class': 'regular'})

# Find all links that also carry an id attribute (these appear to be the artist links).
artistlinks = [x.attrs['href'] for x in articles.find_all('a') if 'id' in x.attrs]

# Prepend the base domain so that the links work.
artistlinks = ['https://partyflock.nl/' + x for x in artistlinks]

# Pull out the artist IDs.
artistids = [x.attrs['id'] for x in articles.find_all('a') if 'id' in x.attrs]

# Pull out the artist names from the link text.
artistnames = [x.text for x in articles.find_all('a') if 'id' in x.attrs]

# Collect all of this information in a dataframe.
artistdata = pd.DataFrame({'mylinks': artistlinks, 'ids': artistids, 'names': artistnames})

# Create the empty fans column.
artistdata['fans'] = None

# For each row in the artist data, scan the artist's page for the number of fans
# and, if found, put it into the table.
for z in range(len(artistdata)):
    print('Scanning artist number ' + str(z + 1) + ' out of ' + str(len(artistdata)))
    # Add error handling in case the page goes down, and skip to the next artist on failure.
    try:
        page = requests.get(artistdata.loc[z, 'mylinks'], headers = headers)
    except requests.exceptions.RequestException:
        print('Failed to scan page ' + str(z + 1) + '!')
        continue
    # Parse the html.
    page = bs(page.text, 'lxml')
    # Guard against pages with no fan data; those rows stay None in the output.
    fanlink = page.find('a', href = re.compile('fans'), string = re.compile('^[0-9]+$'))
    if fanlink is not None:
        artistdata.loc[z, 'fans'] = fanlink.text
    # Save the data every 100 pages in case of a crash.
    if z % 100 == 0:
        artistdata.to_csv('Artists with number of fans.csv', index = False)
    # Pause the scraper for 2s to be polite to the website.
    time.sleep(2)
# Write the data to csv (it will appear in the working directory you set at the top of the script).
artistdata.to_csv('Artists with number of fans.csv', index = False)
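Once the scrape finishes, the CSV can be pulled back in for a quick sanity check. This is just a sketch of a possible follow-up, not part of the original script; the results name and the to_numeric/sort step are illustrative additions.
import pandas as pd

# Hypothetical follow-up: reload the saved CSV and coerce the scraped fan
# counts (stored as text) to numbers; artists with no fan data become NaN.
results = pd.read_csv('Artists with number of fans.csv')
results['fans'] = pd.to_numeric(results['fans'], errors = 'coerce')

# Show the ten artists with the most fans.
print(results.sort_values('fans', ascending = False).head(10))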