Created
January 24, 2019 16:48
-
-
Save chadmhorner/6843aaf10b5f40548e6262fe71847dcf to your computer and use it in GitHub Desktop.
Scrape Spotify Charts using ReadyPipe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from readypipe import requests, starting_task, subtask, schedule, save, schedule_many | |
import datetime | |
# URL template for one chart page. The two %s placeholders are filled, in
# order, with a region code and a date, e.g. ('global', 'latest').
CHART_URL = 'https://spotifycharts.com/regional/%s/daily/%s'
@starting_task
def load_todays_urls():
    """Schedule one ``scrape_chart`` subtask per region for yesterday's chart.

    Loads the latest global chart page, collects the ``data-value`` attribute
    of every ``<li>`` under the first ``div[@data-type="country"]`` element
    (falling back to ``'global'`` when the attribute is absent), and schedules
    ``scrape_chart`` with a ``(region, date)`` pair for each distinct region.

    Raises:
        IndexError: if the page contains no country selector element.
    """
    chart_page = requests.get_dom_from_content(CHART_URL % ('global', 'latest'))

    # NOTE(review): today() is the local date of the machine running the task;
    # confirm that this lines up with the chart site's notion of "yesterday".
    yesterday = (datetime.date.today() - datetime.timedelta(days=1)).isoformat()

    selectors = chart_page.xpath('//*/div[@data-type="country"]')
    if not selectors:
        # Same exception type as the bare [0] lookup, but with a useful message.
        raise IndexError(
            'no div[@data-type="country"] element found on the latest global chart page'
        )

    # A set removes duplicate region codes; sorting makes the scheduling
    # order reproducible from run to run.
    regions = {
        item.attrib.get('data-value', 'global')
        for item in selectors[0].xpath('descendant::li')
    }
    schedule_many('scrape_chart', [(region, yesterday) for region in sorted(regions)])
@subtask
def scrape_chart(region, date):
    """Fetch one region's chart page and save a 'songs' record per table row.

    Args:
        region: value substituted into the first placeholder of CHART_URL.
        date: value substituted into the second placeholder of CHART_URL.

    Each record keeps both the raw cell text and a parsed value for the
    artist name, chart position and stream count.
    """
    dom = requests.get_dom_from_content(CHART_URL % (region, date))

    for row in dom.xpath('//*/table[@class="chart-table"]/tbody/tr'):
        # Raw text of the first matching element for each field.
        title_text = row.xpath('child::td[@class="chart-table-track"]/strong')[0].text
        credit_text = row.xpath('child::td[@class="chart-table-track"]/span')[0].text
        rank_text = row.xpath('child::td[@class="chart-table-position"]')[0].text
        plays_text = row.xpath('child::td[@class="chart-table-streams"]')[0].text

        record = {'region': region, 'date': date}
        record['track_name'] = title_text
        # Drops the first three characters of the credit text; the original
        # author's note says this removes a leading "By " prefix.
        record['artist_name'] = credit_text[3:]
        record['artist_name_raw'] = credit_text
        record['position'] = int(rank_text)
        record['position_raw'] = rank_text
        # Remove the thousands separators before converting to int.
        record['streams'] = int(plays_text.replace(',', ''))
        record['streams_raw'] = plays_text

        save('songs', record)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment