Skip to content

Instantly share code, notes, and snippets.

@jiko
Created November 4, 2013 17:06
Show Gist options
  • Save jiko/7305816 to your computer and use it in GitHub Desktop.
Save jiko/7305816 to your computer and use it in GitHub Desktop.
Scrapes Rap Genius for song lyrics. Leaves some mess to clean up in the output.
import requests # for grabbing pages
from bs4 import BeautifulSoup # for parsing pages
import codecs
# Fetch the artist's landing page. Its song list and pagination links seed
# everything the rest of the script does.
root = "http://rapgenius.com"
path = "/artists/2-chainz"
page = requests.get(root + path)
# Parse the downloaded markup. The parser is named explicitly so the result
# does not depend on which optional parser libraries happen to be installed.
html = BeautifulSoup(page.text, "html.parser")
# Collect the href of every song link found inside .song_list elements,
# starting with the artist page that is already loaded in `html`.
song_links = []
song_links.extend(a['href'] for a in html.select(".song_list li a"))
# The artist page only lists some of the songs; the rest sit behind the links
# in the .pagination element. The same page can be linked more than once, so
# collapse duplicates (sorted to keep the fetch order stable between runs).
page_links = [a['href'] for a in html.select(".pagination a")]
pages = sorted(set(page_links))
for p in pages:
    # hrefs are concatenated onto the site root, i.e. they are expected to be
    # site-relative paths.
    page = requests.get(root + p)
    html = BeautifulSoup(page.text, "html.parser")
    song_links.extend(a['href'] for a in html.select(".song_list li a"))
# A song may be listed on several pages; keep each link once, in a
# deterministic order so the output file is reproducible.
songs = sorted(set(song_links))
# Visit every song page and gather its lyrics elements into `lyrics`.
lyrics = []
for song in songs:
    # Progress output; the single-argument call form prints the same thing on
    # Python 2 and Python 3.
    print(song)
    # make a request to grab each page linked to
    page = requests.get(root + song)
    print(page.status_code)
    html = BeautifulSoup(page.text, "html.parser")
    # the elements with the content we want have the class "lyrics"
    lines = html.select(".lyrics")
    song_blocks = []
    for line in lines:
        # discard the <a> and <br> tags themselves but keep whatever they wrap
        for a in line.select("a"):
            a.unwrap()
        for br in line.select("br"):
            br.unwrap()
        song_blocks.append(line)
    # TODO: section the lyrics out by [*] headers. Keep a section when its
    # header has no artist name (indicated by a colon) or names "2 Chainz";
    # otherwise discard it. Not implemented yet: everything is kept.
    lyrics.extend(song_blocks)  # save it all into lyrics list
# Write the text of every collected lyrics element to lyrics.txt as UTF-8.
# Elements are written back to back; no separator is added between them.
with codecs.open('lyrics.txt', 'w', 'utf-8') as out:
    for lyric in lyrics:
        out.write(lyric.get_text())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment