Skip to content

Instantly share code, notes, and snippets.

@Restioson
Last active April 21, 2017 11:10
Show Gist options
  • Save Restioson/d76082158893766a7ef479df28201500 to your computer and use it in GitHub Desktop.
# Imports
from bs4 import BeautifulSoup
import string
from urllib.request import urlopen
import time
import nltk
import json
def crawl():
    """Crawl lyrics from www.azlyrics.com and save one JSON file per artist.

    For every alphabet index page (a-z, plus the numeric "19" page) it walks
    each artist page, fetches every song's lyrics, runs them through
    ``process`` and writes a ``{song_name: processed_lyrics}`` dict to
    ``./lyrics/<artist>``.

    Side effects: network requests, console output, files under ``./lyrics/``.
    """
    # Alphabet index pages plus the numeric index page.
    index_pages = ["http://www.azlyrics.com/{0}.html".format(page)
                   for page in string.ascii_lowercase]
    index_pages.append("http://www.azlyrics.com/19.html")

    # Crawl through index pages
    for index_page_url in index_pages:
        print(index_page_url)
        # `with` closes the HTTP response promptly (fixes a resource leak:
        # the original never closed any urlopen() response).
        with urlopen(index_page_url) as response:
            index_page = BeautifulSoup(response.read(), "html.parser")
        # Artist links are relative hrefs whose first path segment is "a".
        artists = [link for link in index_page.find_all("a", href=True)
                   if link["href"].split("/")[0] == "a"]
        # Be polite to the server between index fetches.
        time.sleep(2)

        for artist_link in artists:
            artist_url = "http://www.azlyrics.com/" + artist_link["href"]
            print(artist_url)
            with urlopen(artist_url) as response:
                artist_page = BeautifulSoup(response.read(), "html.parser")
            # Song links look like "../lyrics/<artist>/<song>.html", i.e.
            # their second path segment is "lyrics".
            songs = [link for link in artist_page.find_all("a", href=True)
                     if len(link["href"].split("/")) > 1
                     and link["href"].split("/")[1] == "lyrics"]

            song_lyrics_all = {}
            for song_link in songs:
                print(song_link)
                song_url = ("http://www.azlyrics.com/"
                            + song_link["href"].replace("../", ""))
                with urlopen(song_url) as response:
                    song_page = BeautifulSoup(response.read(), "html.parser")
                # BUG FIX: the original did "\n".join(...text), which joins
                # the *characters* of the lyrics string with newlines and
                # destroys tokenization downstream. Use the text directly.
                lyrics_raw = song_page.find("div", class_=None).text
                lyrics = process(lyrics_raw)
                # Key by the song's filename stem:
                # ".../lyrics/<artist>/<song>.html" -> "<song>".
                song_lyrics_all[song_link["href"].split("/")[3].split(".")[0]] = lyrics

            # Persist one JSON file per artist (assumes ./lyrics/ exists).
            with open("./lyrics/" + artist_link["href"].split("/")[1], "w") as artist_file:
                artist_file.write(json.dumps(song_lyrics_all))
def process(text):
    """Tokenize, POS-tag, and lemmatize *text*.

    Returns a list of ``(lemma, pos_tag)`` tuples, one per token.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Tokenize first, then tag, then lemmatize each (word, tag) pair.
    tokens = nltk.word_tokenize(text)
    return [(lemmatizer.lemmatize(word), tag)
            for word, tag in nltk.pos_tag(tokens)]
# Run the crawler only when executed as a script, so importing this module
# (e.g. to reuse process()) does not kick off a full site crawl.
if __name__ == "__main__":
    crawl()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment