-
-
Save ozzimpact/40c11b950805d838ef9a41aabdc2c016 to your computer and use it in GitHub Desktop.
Scrape Spotify Charts using ReadyPipe
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from readypipe import requests, starting_task, subtask, schedule, save, schedule_many | |
import datetime | |
CHART_URL = 'https://spotifycharts.com/regional/%s/daily/%s' | |
@starting_task
def load_todays_urls():
    """Schedule one ``scrape_chart`` subtask per region for yesterday's date.

    Fetches the latest global chart page, collects the region codes found in
    its country selector, and queues a ``(region, date)`` pair for each one
    via ``schedule_many``.

    Raises:
        IndexError: if the page contains no ``div[@data-type="country"]``.
    """
    # Visit the latest global page to read the list of regions from it.
    chart_page = requests.get_dom_from_content(CHART_URL % ('global', 'latest'))

    # Yesterday's date (relative to the local clock) formatted as YYYY-MM-DD.
    yesterday = (
        datetime.datetime.today() - datetime.timedelta(days=1)
    ).strftime('%Y-%m-%d')

    # Only the first matching country selector element is used.
    country_list = chart_page.xpath('//*/div[@data-type="country"]')[0]

    # A set removes duplicate region codes; an <li> without a data-value
    # attribute is recorded as 'global'.
    regions = {
        country.attrib.get('data-value', 'global')
        for country in country_list.xpath('descendant::li')
    }

    # One (region, date) argument tuple per subtask invocation.
    schedule_many('scrape_chart', [(region, yesterday) for region in regions])
def _strip_artist_prefix(artist_text):
    """Return ``artist_text`` without a leading "by " (any letter case).

    Text that does not start with that prefix is returned unchanged, so a
    cell without the prefix no longer loses its first three characters.
    """
    if artist_text.lower().startswith('by '):
        return artist_text[3:]
    return artist_text


@subtask
def scrape_chart(region, date):
    """Scrape one chart page and save a 'songs' record for every table row.

    Args:
        region: region code substituted into ``CHART_URL``.
        date: date string substituted into ``CHART_URL``.

    Each saved record holds the parsed value and the raw cell text for the
    artist, position and stream count, alongside the region, date and track
    name.

    Raises:
        IndexError: if a row is missing one of the expected cells.
        ValueError: if the position or stream count is not an integer once
            thousands separators are removed.
    """
    chart_page = requests.get_dom_from_content(CHART_URL % (region, date))
    songs = chart_page.xpath('//*/table[@class="chart-table"]/tbody/tr')
    for song in songs:
        _track_name = song.xpath('child::td[@class="chart-table-track"]/strong')[0].text  # song name
        _artist_name = song.xpath('child::td[@class="chart-table-track"]/span')[0].text  # artist name
        _position = song.xpath('child::td[@class="chart-table-position"]')[0].text  # chart position
        _streams = song.xpath('child::td[@class="chart-table-streams"]')[0].text  # number of streams
        data = {
            'region': region,
            'date': date,
            'track_name': _track_name,
            'artist_name': _strip_artist_prefix(_artist_name),  # exclude "By "
            'artist_name_raw': _artist_name,
            'position': int(_position),
            'position_raw': _position,
            'streams': int(_streams.replace(',', '')),  # drop thousands separators
            'streams_raw': _streams,
        }
        save('songs', data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment