Skip to content

Instantly share code, notes, and snippets.

@andlima
Created April 7, 2012 03:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andlima/2324940 to your computer and use it in GitHub Desktop.
Save andlima/2324940 to your computer and use it in GitHub Desktop.
Scrape letras.terra.com.br
"""
A short script that, given an artist, scrapes all his/her/their songs
from 'http://letras.terra.com.br'
Dependency: BeautifulSoup, to parse the lyrics from each song page.
Author: Andre Lima - http://github.com/andlima
Licensed under MIT License
Disclaimer: Sorry for the ugly code and lack of documentation. I'll try
to give it some love later.
"""
import sys
import os
import re
import urllib
from BeautifulSoup import BeautifulSoup
# Site root, kept as a one-element list so URLs can be built by joining
# path components with '/' (see get_url).
URL_BASE = ['http://letras.terra.com.br']
# Regex matching song links ("elemsug" anchors) on an artist's index page.
ELEMSUG = '<a class="elemsug" href="(?P<song>[^"]+)">'
# Regex for the lyrics container div.
# NOTE(review): appears unused in this file — the script parses the div
# with BeautifulSoup instead (soup.find('div', id='div_letra')).
DIV_LETRA = '<div id="div_letra">(?P<lyrics>.+)</div>'
# Local cache directory ('./cache'), as path components for '/'.join.
CACHE = ['.', 'cache']
def make_sure_dir_exists(d):
    """Create directory `d` if it does not already exist.

    The original swallowed *every* OSError, which hides real failures
    (permission denied, invalid path, parent missing).  Only the benign
    "already exists" case is ignored now; anything else is re-raised.
    """
    import errno  # local import keeps the module's import block untouched
    try:
        os.mkdir(d)
    except OSError as e:
        # EEXIST is the expected, harmless case; everything else is a bug.
        if e.errno != errno.EEXIST:
            raise
def get_url(path):
    """Build the full site URL for `path` (a list of path components)."""
    parts = URL_BASE + path
    return '/'.join(parts)
def get_file_name(path):
    """Map `path` (a list of components) to its local cache file name."""
    parts = CACHE + path
    return '/'.join(parts)
def content_for_path(path):
    """Return the page at `path` as a list of stripped lines.

    `path` is a list of URL components, e.g.:
      - ['chico-buarque']            (an artist index page)
      - ['legiao-urbana', '1234']    (a single song page)

    The raw page is cached on disk (see CACHE); a one-component artist
    path is cached under '<artist>/root' so the directory can also hold
    the artist's song files.
    """
    if len(path) == 1:
        # Artist index: make the artist's cache dir, store under 'root'.
        make_sure_dir_exists(get_file_name(path))
        cache_path = path + ['root']
    else:
        # Song page: make sure the artist's cache dir exists.
        make_sure_dir_exists(get_file_name(path[:-1]))
        cache_path = path
    try:
        cached = open(get_file_name(cache_path), 'r')
        try:
            raw = cached.readlines()
        finally:
            cached.close()
    except IOError:
        # Cache miss: fetch from the site and store the raw lines.
        raw = urllib.urlopen(get_url(path)).readlines()
        with open(get_file_name(cache_path), 'w') as out:
            out.writelines(raw)
    return [line.strip() for line in raw]
def fix_lyrics(lyrics):
    """Wrap `lyrics` in <song> tags and translate its HTML markup
    (<p>, </p>, <br />) into <strophe>/<verse> pseudo-tags.
    """
    result = '<song> ' + lyrics + ' </song>'
    # NOTE(review): the first string of the last pair is a whitespace-like
    # character (possibly U+00A0) reproduced verbatim from the original.
    for old, new in (('<p>', '<strophe> <verse> '),
                     ('</p>', ' </verse> </strophe> '),
                     ('<br />', ' </verse> <verse> '),
                     (' ', ' ')):
        result = result.replace(old, new)
    return result
if __name__ == '__main__':
    # Make sure the local cache directory ('./cache') exists first.
    make_sure_dir_exists('/'.join(CACHE))
    # The artist slug comes from the command line; default to a sample
    # artist when none is given.
    try:
        path = [sys.argv[1]]
    except IndexError:
        # Examples:
        # path = ['engenheiros-do-hawaii']
        # path = ['legiao-urbana']
        # path = ['chico-buarque']
        path = ['the-beatles']
    # Scan the artist index page for song links and split each href
    # into path components (e.g. ['the-beatles', '1234']).
    song_paths = []
    for line in content_for_path(path):
        song_paths.extend([m.group('song').strip('/').split('/')
                           for m in re.finditer(ELEMSUG, line)])
    # Fetch each song, extract the lyrics div, and append the normalized
    # lyrics to '<artist>/processed', one song per line.
    with open(get_file_name(path + ['processed']), 'w') as output:
        for song_path in song_paths:
            print 'processing', song_path
            content = ' '.join(content_for_path(song_path))
            soup = BeautifulSoup(content)
            lhtml = soup.find('div', id='div_letra')
            # Keep only odd-indexed children of the div — presumably the
            # text nodes between <br/> tags.  TODO(review): confirm this
            # holds for the site's current markup.
            lyrics = ' '.join([str(p) for (i, p) in enumerate(lhtml.contents)
                               if i % 2 == 1])
            output.write(fix_lyrics(lyrics) + '\n')
            # Flush per song so partial progress survives a crash.
            output.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment