Created
November 4, 2013 17:06
-
-
Save jiko/7305816 to your computer and use it in GitHub Desktop.
Scrapes Rap Genius for song lyrics. Leaves some mess to clean up in the output.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape Rap Genius for song lyrics.
#
# Starting from one artist page, collect the links found in its ".song_list"
# elements (following the pagination links as well), download every song
# page, and write the text of each ".lyrics" element to lyrics.txt.
# The output is raw: annotations, section headers and featured artists' verses
# are not filtered out yet.
import requests  # for grabbing pages
from bs4 import BeautifulSoup  # for parsing pages
import codecs

# grab the artist page
root = "http://rapgenius.com"
path = "/artists/2-chainz"
page = requests.get(root + path)
# load downloaded page into BS; the parser is named explicitly so the parse
# tree does not depend on which optional parsers happen to be installed
html = BeautifulSoup(page.text, "html.parser")
song_links = []
# let's grab all the links in .song_list elements
song_links.extend(a['href'] for a in html.select(".song_list li a"))
# to get all the songs we also have to request every page linked from the
# pagination div; the set() drops repeated pagination links so no page is
# requested twice, and sorting keeps the request order reproducible
page_links = [a['href'] for a in html.select(".pagination a")]
pages = sorted(set(page_links))
for p in pages:
    page = requests.get(root + p)
    html = BeautifulSoup(page.text, "html.parser")
    song_links.extend(a['href'] for a in html.select(".song_list li a"))
# the same song can be linked from several pages: de-duplicate, and sort so
# the order of lyrics.txt is the same on every run
songs = sorted(set(song_links))
lyrics = []
for song in songs:
    # single-argument print() behaves the same on Python 2 and Python 3
    print(song)
    # make a request to grab each page linked to
    page = requests.get(root + song)
    print(page.status_code)
    html = BeautifulSoup(page.text, "html.parser")
    # the div with the content we want has the class "lyrics"
    lines = html.select(".lyrics")
    TwoChainz = []
    for line in lines:
        # discard all the tags but save their content
        for a in line.select("a"):
            a.unwrap()
        for br in line.select("br"):
            br.unwrap()
        TwoChainz.append(line)
    # TODO: section it out by [*]
    # if the [*] has no artist name (indicated by a colon) or the artist
    # name "2 Chainz" we want those lyrics; otherwise, discard
    lyrics.extend(TwoChainz)  # save it all into lyrics list
# write lyrics list to a text file
with codecs.open('lyrics.txt', 'w', 'utf-8') as out_file:
    for lyric in lyrics:
        out_file.write(lyric.get_text())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment