Last active
August 29, 2015 14:26
-
-
Save abhi18av/2488af1734388d2323e0 to your computer and use it in GitHub Desktop.
Remember the Songs!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I have left an ample amount of trail and rough work in here so that you may follow up the workflow | |
# and modify the program as necessary for your needs. | |
""" | |
Disclaimer :- I am not responsible for any damage done via this code or its derivatives. | |
Advice :- Be careful with the sites you play with. Be nice and don't wander away from the path of light! | |
Most Muggles don't like Magic;P | |
""" | |
################# VERSION 1
"""
The goal is to fetch all the song names and links from the website
<< http://lyricstranslate.com/en/laura-naerhi-lyrics.html >>
and then using those links to navigate over there and to fetch the lyrics.
"""
## Start by isolating the names of the songs from the list
import requests
from bs4 import BeautifulSoup

url = "http://lyricstranslate.com/en/laura-naerhi-lyrics.html"
r = requests.get(url)
# Name the parser explicitly so bs4 does not warn and pick a different
# parser on different machines.
soup = BeautifulSoup(r.content, "html.parser")
# BUG FIX: prettify is a method -- without the parentheses the original
# printed the bound-method object instead of the formatted HTML.
print(soup.prettify())
# Each song row on the index page lives in a <td class="songName">.
songnames = soup.find_all('td', class_='songName')
# Peek at the first few entries to see what a cell looks like.
for cell in songnames[:5]:
    print(cell, "\n")
# Interactive exploration of one cell: tag, anchor, text, link target.
songnames[0]
songnames[0].a
songnames[0].a.string  # songnames[0].a.text
songnames[0].a["href"]
############################## VERSION 2
"""
Here we aim to isolate 3 things
> Singer
> Song name
> Song link
"""
# The page <title> carries the singer's name.
soup.title.text
# BUG FIX: the original printed songnames[0].a["href"] inside the loop,
# so every song appeared to share the FIRST song's link.  Iterate the
# cells directly and take each cell's own href.
for cell in songnames:
    print(cell.a.string, " --->> ", cell.a["href"], "\n")
############## VERSION 3
"""
Here we wish to isolate only the songs
> title
> lyrics
"""
# hrefs on the index page are site-relative, so prefix the domain.
song_url = "http://lyricstranslate.com" + songnames[4].a["href"]
print(song_url)
r_song = requests.get(song_url)
# Explicit parser, consistent with the index-page soup.
song_soup = BeautifulSoup(r_song.content, "html.parser")
# BUG FIX: prettify is a method -- the original printed the bound-method
# object instead of the formatted HTML.
print(song_soup.prettify())
#lyrics = song_soup.find('div', class_ ='title-h2')
## WE USE A DIFFERENT NOTATION FOR CLASS
lyrics = song_soup.find('div', {"class": "song-node-text"})
print(lyrics.text)
# Hoist the children list out of the loop instead of rebuilding it
# (and re-indexing it) on every iteration as the original did.
children = list(lyrics.children)
for child in children:
    print(child, "\n")
"""
some work still to be done in version 3
"""
song_lyrics = lyrics.find_all("p")
song_lyrics[1]
for para in song_lyrics:
    print(para)
# NOTE(review): the 1:18 slice looks hand-tuned to this one page -- confirm.
only_para = children[1:18]
only_para
################ VERSION 4
"""
The goal is to save the lyrics in a file with utf-8 encoding
in the songs&poetry directory
"""
import os

# Page <title> first, then the lyric text.
to_be_saved = str(song_soup.title) + "\n" + str(lyrics.text)
# BUG FIX: the original non-raw string 'W:\songs&poetry\file1.txt' let
# Python interpret '\f' as a form-feed escape, silently corrupting the
# path.  A raw string keeps the Windows backslashes literal.
writepath = r'W:\songs&poetry\file1.txt'
# Append if the file already exists, otherwise create it fresh.
mode = 'a' if os.path.exists(writepath) else 'w'
with open(writepath, mode, encoding="utf-8") as f:
    # f.write(lyrics.text)  # saves only the main lyrics
    f.write(to_be_saved)
############################## VERSION 5
"""
Now we fetch the lyrics of all the songs listed on the index page
"""
import os

import requests
from bs4 import BeautifulSoup

# Re-fetch the index page so this section runs standalone.
url = "http://lyricstranslate.com/en/laura-naerhi-lyrics.html"
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")  # explicit parser, see VERSION 1
songnames = soup.find_all('td', class_='songName')

writepath = 'W:\\songs&poetry\\finnish\\songs\\laura-narhi\\songs.txt'
for cell in songnames:
    song_url = "http://lyricstranslate.com" + cell.a["href"]
    r_song = requests.get(song_url)
    song_soup = BeautifulSoup(r_song.content, "html.parser")
    lyrics = song_soup.find('div', {"class": "song-node-text"})
    # ROBUSTNESS: find() returns None when the div is absent (translation
    # pages, removed songs); skip those instead of crashing mid-crawl.
    if lyrics is None:
        continue
    to_be_saved = str(song_soup.title) + "\n" + str(lyrics.text)
    # 'w' truncates on the very first run; every later write appends.
    mode = 'a' if os.path.exists(writepath) else 'w'
    with open(writepath, mode, encoding="utf-8") as f:
        f.write(to_be_saved)
# end of For loop
#######################
# Scratchpad: poking around the parse tree interactively.
soup.contents[1].contents[3]
songnames[0].contents
songnames[0].contents[0]
songnames[0].contents[0].text
#
# Left over from an earlier experiment on a different page:
#for university in universities:
#    print(university['href']+","+university.string)
for name in songnames:
    print(name)
#    print(name.string, " <- - -> ", name['href'])
# First three top-level children of the document.
ls = list(soup.children)
for child in ls[:3]:
    print(child)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.