Skip to content

Instantly share code, notes, and snippets.

@chadmhorner
Created January 24, 2019 16:48
Show Gist options
  • Save chadmhorner/6843aaf10b5f40548e6262fe71847dcf to your computer and use it in GitHub Desktop.
Save chadmhorner/6843aaf10b5f40548e6262fe71847dcf to your computer and use it in GitHub Desktop.
Scrape Spotify Charts using ReadyPipe
from readypipe import requests, starting_task, subtask, schedule, save, schedule_many
import datetime
# URL template for a daily regional chart page. The two %s slots are filled,
# in order, with a region code (e.g. 'global') and a date string or 'latest'.
CHART_URL = 'https://spotifycharts.com/regional/%s/daily/%s'
@starting_task
def load_todays_urls():
    """Schedule one ``scrape_chart`` subtask per region for yesterday's chart.

    Loads the latest global chart page, collects the region codes listed in
    its country selector, and schedules a ``(region, date)`` pair for each
    distinct region, where ``date`` is yesterday's date (local clock)
    formatted as ``YYYY-MM-DD``.
    """
    # The latest global page is only used as the source of the region list.
    chart_page = requests.get_dom_from_content(CHART_URL % ('global', 'latest'))

    yesterday = (
        datetime.datetime.today() - datetime.timedelta(days=1)
    ).strftime('%Y-%m-%d')

    # First element whose data-type is "country" holds the region <li> items.
    # NOTE(review): the [0] index presumably raises IndexError if the page
    # layout changes and nothing matches -- confirm that is the desired
    # failure mode for the scheduler.
    country_list = chart_page.xpath('//*/div[@data-type="country"]')[0]

    # A set removes duplicate codes; items without a data-value attribute
    # fall back to 'global'.
    regions = {
        country.attrib.get('data-value', 'global')
        for country in country_list.xpath('descendant::li')
    }

    # Sort so the scheduling order is deterministic rather than depending on
    # set iteration order.
    schedule_many(
        'scrape_chart',
        [(region, yesterday) for region in sorted(regions)],
    )
@subtask
def scrape_chart(region, date):
    """Scrape one daily chart page and save a record for every table row.

    Args:
        region: Region code substituted into the first slot of ``CHART_URL``.
        date: Date string substituted into the second slot of ``CHART_URL``.

    For each row of the ``chart-table`` table, one dict is saved to the
    ``'songs'`` collection containing the region, date, track name, artist
    name, chart position and stream count. The raw text of the artist,
    position and streams cells is stored alongside the parsed values.
    """
    chart_page = requests.get_dom_from_content(CHART_URL % (region, date))
    songs = chart_page.xpath('//*/table[@class="chart-table"]/tbody/tr')
    for song in songs:
        _track_name = song.xpath('child::td[@class="chart-table-track"]/strong')[0].text  # song name
        _artist_name = song.xpath('child::td[@class="chart-table-track"]/span')[0].text  # artist name
        _position = song.xpath('child::td[@class="chart-table-position"]')[0].text  # chart position
        _streams = song.xpath('child::td[@class="chart-table-streams"]')[0].text  # number of streams

        # Strip the leading "by " only when it is actually there; slicing
        # unconditionally would chop three characters off any artist string
        # that lacks the prefix.
        artist_name = _artist_name
        if artist_name.lower().startswith('by '):
            artist_name = artist_name[3:]

        data = {
            'region': region,
            'date': date,
            'track_name': _track_name,
            'artist_name': artist_name,
            'artist_name_raw': _artist_name,
            'position': int(_position),
            'position_raw': _position,
            # Stream counts are rendered with thousands separators.
            'streams': int(_streams.replace(',', '')),
            'streams_raw': _streams,
        }
        save('songs', data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment