Last active
April 21, 2017 11:10
-
-
Save Restioson/d76082158893766a7ef479df28201500 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports | |
from bs4 import BeautifulSoup | |
import string | |
from urllib.request import urlopen | |
import time | |
import nltk | |
import json | |
def crawl():
    """Crawl www.azlyrics.com and save each artist's lyrics as JSON.

    Walks the alphabetical index pages (a.html ... z.html plus the numeric
    "19" page), follows every artist link, then every song link, and writes
    one file per artist under ./lyrics/ mapping song slug -> processed
    lyrics (the token/POS-tag pairs returned by process()).
    """
    # Generate the list of alphabet index pages (a.html ... z.html)
    index_pages = ["http://www.azlyrics.com/{0}.html".format(page)
                   for page in string.ascii_lowercase]
    # Add the numeric ("19") index page
    index_pages.append("http://www.azlyrics.com/19.html")

    # Crawl through index pages
    for index_page_url in index_pages:
        print(index_page_url)
        # Get index page and load into soup
        index_page = BeautifulSoup(urlopen(index_page_url).read(), "html.parser")
        # Artist links are relative hrefs of the form "a/<artist>.html"
        artists = [link for link in index_page.find_all("a", href=True)
                   if link["href"].split("/")[0] == "a"]
        # Be polite: pause between requests so we don't hammer the server
        time.sleep(2)

        # Crawl artists
        for artist_link in artists:
            print("http://www.azlyrics.com/" + artist_link["href"])
            # Get artist page and load into soup
            artist_page = BeautifulSoup(
                urlopen("http://www.azlyrics.com/" + artist_link["href"]).read(),
                "html.parser")
            # Song links look like "../lyrics/<artist>/<song>.html"
            songs = [link for link in artist_page.find_all("a", href=True)
                     if len(link["href"].split("/")) > 1
                     and link["href"].split("/")[1] == "lyrics"]
            # Accumulates song slug -> processed lyrics for this artist
            song_lyrics_all = {}
            time.sleep(2)

            # Crawl songs
            for song_link in songs:
                print(song_link)
                # Get song page ("../" stripped to make the href site-absolute)
                song_page = BeautifulSoup(
                    urlopen("http://www.azlyrics.com/"
                            + song_link["href"].replace("../", "")).read(),
                    "html.parser")
                # BUG FIX: the original did "\n".join(div.text), which joins
                # the *characters* of the lyrics string with newlines,
                # corrupting every saved lyric.  get_text() already returns
                # the full text of the div; use it directly.
                lyrics_raw = song_page.find("div", class_=None).get_text()
                # Process lyrics (tokenize, POS-tag, lemmatize)
                lyrics = process(lyrics_raw)
                # Key by the song slug (the filename without ".html")
                song_lyrics_all[song_link["href"].split("/")[3].split(".")[0]] = lyrics
                # Pause between song requests too, not just index pages
                time.sleep(2)

            # Save one JSON file per artist, named after the artist slug.
            # NOTE(review): assumes the ./lyrics/ directory already exists —
            # create it (e.g. os.makedirs) before running, or open() raises.
            with open("./lyrics/" + artist_link["href"].split("/")[1], "w") as artist_file:
                artist_file.write(json.dumps(song_lyrics_all))
def process(text):
    """Tokenize, POS-tag and lemmatize *text*.

    Returns a list of (lemma, pos_tag) tuples, one per token, in the
    order the tokens appear in the input string.
    """
    # Bind the lemmatizer's method once so the comprehension stays terse
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    # Tokenize, then tag each token with its part of speech
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    # Pair each lemmatized token with its original POS tag
    return [(lemmatize(token), pos) for token, pos in tagged_tokens]
# Run the crawler only when executed as a script — importing this module
# for its functions should not kick off a multi-hour network crawl.
if __name__ == "__main__":
    crawl()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment