# jiko/rapgenius_scraper.py

## rapgenius_scraper.py
import requests # for grabbing pages
from bs4 import BeautifulSoup # for parsing pages
import codecs

# grab page
root = "http://rapgenius.com"
path = "/artists/2-chainz"
page = requests.get(root+path)

# load downloaded page into BS
html = BeautifulSoup(page.text)
song_links = []
# let's grab all the links in .song_list elements
song_links.extend((a['href'] for a in html.select(".song_list li a")))

# if i want to grab all the songs i have to make requests to get all links in the pagination div and exclude either the first or the last to avoid dupes
page_links = [ a['href'] for a in html.select(".pagination a")]
pages = list(set(page_links))
for p in pages:
    page = requests.get(root+p)
    html = BeautifulSoup(page.text)
    song_links.extend((a['href'] for a in html.select(".song_list li a")))

songs = list(set(song_links))
lyrics = []
for song in songs:
    print song
    # make a request to grab each page linked to
    page = requests.get(root+song)
    print page.status_code
    html = BeautifulSoup(page.text)
    # the div with the content we want has the class "lyrics"
    lines = html.select(".lyrics")
    TwoChainz = []
    for line in lines:
        # discard all the tags but save their content
        [a.unwrap() for a in line.select("a")]
        [br.unwrap() for br in line.select("br")]
        TwoChainz.append(line)
        # section it out by [*]
        # if the [*] has no artist name (indicated by a colon) or the artist name "2 Chainz"
        # we want those lyrics
        # otherwise, discard
    lyrics.extend(TwoChainz) # save it all into lyrics list
# write lyrics list to a text file
with codecs.open('lyrics.txt','w','utf-8') as l:
    [l.write(lyric.get_text()) for lyric in lyrics]
	import requests # for grabbing pages
	from bs4 import BeautifulSoup # for parsing pages
	import codecs

	# grab page
	root = "http://rapgenius.com"
	path = "/artists/2-chainz"
	page = requests.get(root+path)

	# load downloaded page into BS
	html = BeautifulSoup(page.text)
	song_links = []
	# let's grab all the links in .song_list elements
	song_links.extend((a['href'] for a in html.select(".song_list li a")))

	# if i want to grab all the songs i have to make requests to get all links in the pagination div and exclude either the first or the last to avoid dupes
	page_links = [ a['href'] for a in html.select(".pagination a")]
	pages = list(set(page_links))
	for p in pages:
	page = requests.get(root+p)
	html = BeautifulSoup(page.text)
	song_links.extend((a['href'] for a in html.select(".song_list li a")))

	songs = list(set(song_links))
	lyrics = []
	for song in songs:
	print song
	# make a request to grab each page linked to
	page = requests.get(root+song)
	print page.status_code
	html = BeautifulSoup(page.text)
	# the div with the content we want has the class "lyrics"
	lines = html.select(".lyrics")
	TwoChainz = []
	for line in lines:
	# discard all the tags but save their content
	[a.unwrap() for a in line.select("a")]
	[br.unwrap() for br in line.select("br")]
	TwoChainz.append(line)
	# section it out by [*]
	# if the [*] has no artist name (indicated by a colon) or the artist name "2 Chainz"
	# we want those lyrics
	# otherwise, discard
	lyrics.extend(TwoChainz) # save it all into lyrics list
	# write lyrics list to a text file
	with codecs.open('lyrics.txt','w','utf-8') as l:
	[l.write(lyric.get_text()) for lyric in lyrics]