Last active
November 10, 2018 22:26
-
-
Save weallwegot/d01afbcb544b9264b323b26e7aae358a to your computer and use it in GitHub Desktop.
Scraping Az Lyrics For All Artists Songs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
managedkaos/azlyrics.py | |
https://gist.github.com/managedkaos/e3262b80154129cc9a976ee6ee943da3 | |
""" | |
# Requests is a library that allows you to programmatically send out http requests | |
import requests | |
# os is a library for doing operating system things, like looking through file directories | |
import os | |
import time | |
import logging | |
import random | |
# BeautifulSoupp is a library made to allow developers to parse through the contents of a webpage | |
from bs4 import BeautifulSoup | |
logger = logging.getLogger('rap_webscraper.{}'.format(__name__)) | |
logger.setLevel(logging.DEBUG) | |
# create file handler which logs even debug messages | |
fh = logging.FileHandler('rap_webscrape.log') | |
fh.setLevel(logging.DEBUG) | |
# create console handler with a higher log level | |
ch = logging.StreamHandler() | |
ch.setLevel(logging.INFO) | |
# create formatter and add it to the handlers | |
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') | |
fh.setFormatter(formatter) | |
ch.setFormatter(formatter) | |
# add the handlers to the logger | |
logger.addHandler(fh) | |
logger.addHandler(ch) | |
SLEEP_TIME = 17.212 | |
NOISE = (-8.438,10.173) | |
# urls for the artists you're interested in | |
urls = ["https://www.azlyrics.com/j/jcole.html", | |
"https://www.azlyrics.com/i/isaiahrashad.html", | |
"https://www.azlyrics.com/a/absoul.html", | |
"https://www.azlyrics.com/j/joeybada.html", | |
"https://www.azlyrics.com/j/jadakiss.html"] | |
# act like a mac when requesting url | |
headers = {'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) \ | |
AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.112 Safari/534.30"} | |
for url in urls: | |
""" | |
sleep before request | |
you could check if the artists directory exists and skip first | |
but if a previous scrape session ended midway you wouldnt check for any missed songs | |
if the url lists gets too long you may get blocked just from the initial request to artist pages | |
""" | |
logger.info('Requesting songs from: {}'.format(url)) | |
pre_request_sleep = SLEEP_TIME+random.uniform(NOISE[0],NOISE[1]) | |
logger.info('Sleeping: {}'.format(pre_request_sleep)) | |
time.sleep(pre_request_sleep) | |
# make a request for the data | |
r = requests.get(url, headers=headers) | |
# convert the response text to soup | |
soup = BeautifulSoup(r.text, "lxml") | |
# get the songs and links to the lyrics | |
lyrics_map = {} | |
artists_file_directory = url.split('/')[-1].replace('.html','') | |
for song_link in soup.find_all("a", href=True): | |
if len(song_link.text) == 0: | |
continue | |
lyrics_map[song_link.text] = song_link['href'] | |
lyric_url = song_link['href'] | |
if ".." in lyric_url: | |
lyric_url = "https://www.azlyrics.com"+lyric_url[2:] | |
filename = song_link.text.replace(' ','_').replace("'",'').replace('/','') | |
filename += ".txt" | |
filename = os.path.join("scraped_data",artists_file_directory,filename) | |
filename = filename.encode('utf-8') | |
if os.path.exists(filename): | |
try: | |
logger.debug('File {} already exists, skipping web request'.format(filename.encode('utf-8'))) | |
except: | |
continue | |
continue | |
try: | |
logger.debug('Requesting: {}'.format(lyric_url)) | |
except: | |
pass | |
""" | |
sleep for some time (in seconds) so you arent banned from sites.. | |
add some random noise to the sleep so it don't look like a robot | |
""" | |
this_sleep = SLEEP_TIME+random.uniform(NOISE[0],NOISE[1]) | |
logger.info('Sleeping: {}'.format(this_sleep)) | |
time.sleep(this_sleep) | |
response = requests.get(lyric_url, headers=headers) | |
new_soup = BeautifulSoup(response.text,"lxml") | |
try: | |
logger.debug('Will Write to: {}'.format(filename)) | |
except: | |
pass | |
# https://stackoverflow.com/questions/12517451/automatically-creating-directories-with-file-output | |
if not os.path.exists(os.path.dirname(filename)): | |
try: | |
os.makedirs(os.path.dirname(filename)) | |
except OSError as exc: # Guard against race condition | |
if exc.errno != errno.EEXIST: | |
raise | |
f = open(filename,"w+") | |
# loop through the no clas divs. they contain the lyrics | |
for lyric in new_soup.find_all("div",{"class":None}): | |
try: | |
f = open(filename,"a") | |
except IOError: | |
logger.warning('IOError could not write filename: {}'.format(filename)) | |
continue | |
try: | |
f.write(lyric.text.encode('utf-8')) | |
except UnicodeError: | |
logger.warning('UnicodeError, Skipping: {}'.format(filename)) | |
f.close() | |
continue | |
# the song panel div has the album name and the year | |
for song_panel_div in new_soup.find_all("div",{"class":"panel songlist-panel noprint"}): | |
try: | |
f.write('ALBUM INFO') | |
f.write(song_panel_div.text.encode('utf-8')) | |
except UnicodeError: | |
logger.warning('UnicodeError, Skipping') | |
f.close() | |
continue | |
f.close() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment