Created
February 9, 2018 14:20
-
-
Save theriley106/7ee065016011f39820ec941692013bb0 to your computer and use it in GitHub Desktop.
Script used to get the data in this dataset: https://www.kaggle.com/theriley106/panic-at-the-dataset/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import re | |
import json | |
from textblob import TextBlob | |
def getLyricSentiment(lyrics):
    """Return the TextBlob sentiment polarity of a block of lyrics.

    Every run of whitespace (newlines, tabs, repeated spaces) is collapsed
    to a single space before the text is handed to TextBlob.

    Args:
        lyrics: Lyric text as a string.

    Returns:
        The ``sentiment.polarity`` value TextBlob reports for the
        whitespace-normalized text.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    lyrics = re.sub(r'\s+', ' ', lyrics)
    return TextBlob(lyrics).sentiment.polarity
def grabSite(url, timeout=30):
    """Fetch *url* with a desktop-browser User-Agent header.

    Args:
        url: Address to request.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the whole
            scrape indefinitely.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            *timeout* seconds.
    """
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0'}
    return requests.get(url, headers=headers, timeout=timeout)
def genSongUrl(songTitle, baseUrl='https://www.azlyrics.com/lyrics/panicatthedisco/{0}.html'):
    """Build a lyrics-page URL from a song title.

    The title is reduced to its word characters (everything matched by
    ``\\w``), joined without separators and lower-cased; that slug is then
    substituted into *baseUrl*.

    Args:
        songTitle: Title of the song; it is converted with ``str()`` first.
        baseUrl: Format string whose ``{0}`` placeholder receives the slug.

    Returns:
        The formatted URL, or ``None`` when the slug is two characters or
        shorter.
    """
    # Raw string: '\w' inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    slug = ''.join(re.findall(r'\w+', str(songTitle))).lower()
    if len(slug) > 2:
        return baseUrl.format(slug)
    return None
def grabLyrics(url):
    """Download a lyrics page and return the lyric text as a string.

    The page is parsed and re-serialized, then the markup between the
    azlyrics licensing comment and the next ``</div`` is cut out, wrapped
    in its own ``<div>`` and parsed again so only its text remains. If the
    licensing comment is not present in the page, the cut-out fragment is
    empty.

    Args:
        url: Address of the lyrics page.

    Returns:
        The text content of the extracted fragment.
    """
    licenseComment = "<!-- Usage of azlyrics.com content by any third-party lyrics provider is prohibited by our licensing agreement. Sorry about that. -->"
    response = grabSite(url)
    serialized = str(bs4.BeautifulSoup(response.text, 'lxml'))
    # Crude string slicing: the lyrics start right after the licensing
    # comment and run up to the first closing div tag that follows it.
    _, _, afterComment = serialized.partition(licenseComment)
    fragment, _, _ = afterComment.partition("</div")
    wrapped = "<div>{}</div>".format(fragment)
    lyricsSoup = bs4.BeautifulSoup(wrapped, 'lxml')
    return str(lyricsSoup.getText())
def grabAllSongs(url):
    """Scrape every song linked from an artist page.

    Each anchor matched by the ``#listAlbum a`` selector is treated as a
    song title. The title is turned into a lyrics URL with ``genSongUrl``;
    when that yields a URL, the lyrics are downloaded and scored.

    Args:
        url: Address of the artist page listing the songs.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"URL"``, ``"Lyrics"``
        and ``"Sentiment"``, one per song that was fetched successfully.
        Songs whose processing raised an error are reported on stdout and
        left out.
    """
    listOfSongs = []
    res = grabSite(url)
    page = bs4.BeautifulSoup(res.text, 'lxml')
    for i, songTitle in enumerate(page.select("#listAlbum a")):
        try:
            title = songTitle.getText()
            # Separate name so the artist-page ``url`` argument is not shadowed.
            songUrl = genSongUrl(title)
            if songUrl is not None:
                lyrics = grabLyrics(songUrl)
                listOfSongs.append({"Title": title, "URL": songUrl, "Lyrics": lyrics, "Sentiment": getLyricSentiment(lyrics)})
        except Exception:
            # Best effort: one bad song must not abort the scrape. Catching
            # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
            print("Error on {}".format(i))
        print("Done with {}".format(i))
    return listOfSongs
if __name__ == '__main__':
    # Scrape the artist's song list, then dump the collected records
    # to data.json in the current working directory.
    songRecords = grabAllSongs("https://www.azlyrics.com/p/panicatthedisco.html")
    with open('data.json', 'w') as jsonFile:
        json.dump(songRecords, jsonFile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment