Last active
April 21, 2017 11:10
-
-
Save Restioson/d76082158893766a7ef479df28201500 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Imports | |
from bs4 import BeautifulSoup | |
import string | |
from urllib.request import urlopen | |
import time | |
import nltk | |
import json | |
def crawl():
    """Crawl www.azlyrics.com and save each artist's lyrics as JSON.

    Walks the alphabetical index pages (a.html ... z.html plus the numeric
    "19" page), follows every artist link, then every song link, and writes
    one file per artist under ./lyrics/ mapping song slug -> processed
    lyrics (the token/POS-tag pairs returned by process()).
    """
    # Generate the list of alphabet index pages (a.html ... z.html)
    index_pages = ["http://www.azlyrics.com/{0}.html".format(page)
                   for page in string.ascii_lowercase]
    # Add the numeric ("19") index page
    index_pages.append("http://www.azlyrics.com/19.html")

    # Crawl through index pages
    for index_page_url in index_pages:
        print(index_page_url)
        # Get index page and load into soup
        index_page = BeautifulSoup(urlopen(index_page_url).read(), "html.parser")
        # Artist links are relative hrefs of the form "a/<artist>.html"
        artists = [link for link in index_page.find_all("a", href=True)
                   if link["href"].split("/")[0] == "a"]
        # Be polite: pause between requests so we don't hammer the server
        time.sleep(2)

        # Crawl artists
        for artist_link in artists:
            print("http://www.azlyrics.com/" + artist_link["href"])
            # Get artist page and load into soup
            artist_page = BeautifulSoup(
                urlopen("http://www.azlyrics.com/" + artist_link["href"]).read(),
                "html.parser")
            # Song links look like "../lyrics/<artist>/<song>.html"
            songs = [link for link in artist_page.find_all("a", href=True)
                     if len(link["href"].split("/")) > 1
                     and link["href"].split("/")[1] == "lyrics"]
            # Accumulates song slug -> processed lyrics for this artist
            song_lyrics_all = {}
            time.sleep(2)

            # Crawl songs
            for song_link in songs:
                print(song_link)
                # Get song page ("../" stripped to make the href site-absolute)
                song_page = BeautifulSoup(
                    urlopen("http://www.azlyrics.com/"
                            + song_link["href"].replace("../", "")).read(),
                    "html.parser")
                # BUG FIX: the original did "\n".join(div.text), which joins
                # the *characters* of the lyrics string with newlines,
                # corrupting every saved lyric.  get_text() already returns
                # the full text of the div; use it directly.
                lyrics_raw = song_page.find("div", class_=None).get_text()
                # Process lyrics (tokenize, POS-tag, lemmatize)
                lyrics = process(lyrics_raw)
                # Key by the song slug (the filename without ".html")
                song_lyrics_all[song_link["href"].split("/")[3].split(".")[0]] = lyrics
                # Pause between song requests too, not just index pages
                time.sleep(2)

            # Save one JSON file per artist, named after the artist slug.
            # NOTE(review): assumes the ./lyrics/ directory already exists —
            # create it (e.g. os.makedirs) before running, or open() raises.
            with open("./lyrics/" + artist_link["href"].split("/")[1], "w") as artist_file:
                artist_file.write(json.dumps(song_lyrics_all))
def process(text):
    """Tokenize, POS-tag and lemmatize *text*.

    Returns a list of (lemma, pos_tag) tuples, one per token, in the
    order the tokens appear in the input string.
    """
    # Bind the lemmatizer's method once so the comprehension stays terse
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    # Tokenize, then tag each token with its part of speech
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    # Pair each lemmatized token with its original POS tag
    return [(lemmatize(token), pos) for token, pos in tagged_tokens]
# Run the crawler only when executed as a script — importing this module
# for its functions should not kick off a multi-hour network crawl.
if __name__ == "__main__":
    crawl()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment