Skip to content

Instantly share code, notes, and snippets.

@andlima
Created April 7, 2012 03:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andlima/2324940 to your computer and use it in GitHub Desktop.
Save andlima/2324940 to your computer and use it in GitHub Desktop.
Scrape letras.terra.com.br
"""
A short script that, given an artist, scrapes all his/her/their songs
from 'http://letras.terra.com.br'
Dependency: BeautifulSoup, to parse the lyrics from each song page.
Author: Andre Lima - http://github.com/andlima
Licensed under MIT License
Disclaimer: Sorry for the ugly code and lack of documentation. I'll try
to give it some love later.
"""
import sys
import os
import re
import urllib
from BeautifulSoup import BeautifulSoup
# Site root, kept as a one-element list so URLs can be built by joining
# path components with '/' (see get_url).
URL_BASE = ['http://letras.terra.com.br']
# Regex matching song links ("elemsug" anchors) on an artist's index page.
ELEMSUG = '<a class="elemsug" href="(?P<song>[^"]+)">'
# Regex for the lyrics container div.
# NOTE(review): appears unused in this file — the script parses the div
# with BeautifulSoup instead (soup.find('div', id='div_letra')).
DIV_LETRA = '<div id="div_letra">(?P<lyrics>.+)</div>'
# Local cache directory ('./cache'), as path components for '/'.join.
CACHE = ['.', 'cache']
def make_sure_dir_exists(d):
    """Create directory `d` if it does not already exist.

    The original swallowed *every* OSError, which hides real failures
    (permission denied, invalid path, parent missing).  Only the benign
    "already exists" case is ignored now; anything else is re-raised.
    """
    import errno  # local import keeps the module's import block untouched
    try:
        os.mkdir(d)
    except OSError as e:
        # EEXIST is the expected, harmless case; everything else is a bug.
        if e.errno != errno.EEXIST:
            raise
def get_url(path):
    """Build the full site URL for `path` (a list of path components)."""
    parts = URL_BASE + path
    return '/'.join(parts)
def get_file_name(path):
    """Map `path` (a list of components) to its local cache file name."""
    parts = CACHE + path
    return '/'.join(parts)
def content_for_path(path):
    """Return the page at `path` as a list of stripped lines.

    `path` is a list of URL components, e.g.:
      - ['chico-buarque']            (an artist index page)
      - ['legiao-urbana', '1234']    (a single song page)

    The raw page is cached on disk (see CACHE); a one-component artist
    path is cached under '<artist>/root' so the directory can also hold
    the artist's song files.
    """
    if len(path) == 1:
        # Artist index: make the artist's cache dir, store under 'root'.
        make_sure_dir_exists(get_file_name(path))
        cache_path = path + ['root']
    else:
        # Song page: make sure the artist's cache dir exists.
        make_sure_dir_exists(get_file_name(path[:-1]))
        cache_path = path
    try:
        cached = open(get_file_name(cache_path), 'r')
        try:
            raw = cached.readlines()
        finally:
            cached.close()
    except IOError:
        # Cache miss: fetch from the site and store the raw lines.
        raw = urllib.urlopen(get_url(path)).readlines()
        with open(get_file_name(cache_path), 'w') as out:
            out.writelines(raw)
    return [line.strip() for line in raw]
def fix_lyrics(lyrics):
    """Wrap `lyrics` in <song> tags and translate its HTML markup
    (<p>, </p>, <br />) into <strophe>/<verse> pseudo-tags.
    """
    result = '<song> ' + lyrics + ' </song>'
    # NOTE(review): the first string of the last pair is a whitespace-like
    # character (possibly U+00A0) reproduced verbatim from the original.
    for old, new in (('<p>', '<strophe> <verse> '),
                     ('</p>', ' </verse> </strophe> '),
                     ('<br />', ' </verse> <verse> '),
                     (' ', ' ')):
        result = result.replace(old, new)
    return result
if __name__ == '__main__':
    # Make sure the local cache directory ('./cache') exists first.
    make_sure_dir_exists('/'.join(CACHE))
    # The artist slug comes from the command line; default to a sample
    # artist when none is given.
    try:
        path = [sys.argv[1]]
    except IndexError:
        # Examples:
        # path = ['engenheiros-do-hawaii']
        # path = ['legiao-urbana']
        # path = ['chico-buarque']
        path = ['the-beatles']
    # Scan the artist index page for song links and split each href
    # into path components (e.g. ['the-beatles', '1234']).
    song_paths = []
    for line in content_for_path(path):
        song_paths.extend([m.group('song').strip('/').split('/')
                           for m in re.finditer(ELEMSUG, line)])
    # Fetch each song, extract the lyrics div, and append the normalized
    # lyrics to '<artist>/processed', one song per line.
    with open(get_file_name(path + ['processed']), 'w') as output:
        for song_path in song_paths:
            print 'processing', song_path
            content = ' '.join(content_for_path(song_path))
            soup = BeautifulSoup(content)
            lhtml = soup.find('div', id='div_letra')
            # Keep only odd-indexed children of the div — presumably the
            # text nodes between <br/> tags.  TODO(review): confirm this
            # holds for the site's current markup.
            lyrics = ' '.join([str(p) for (i, p) in enumerate(lhtml.contents)
                               if i % 2 == 1])
            output.write(fix_lyrics(lyrics) + '\n')
            # Flush per song so partial progress survives a crash.
            output.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment