Skip to content

Instantly share code, notes, and snippets.

@mr-karan
Created June 27, 2015 12:36
Show Gist options
  • Save mr-karan/c5a8ac3847c893e3f6e1 to your computer and use it in GitHub Desktop.
Save mr-karan/c5a8ac3847c893e3f6e1 to your computer and use it in GitHub Desktop.
iPython Scraping
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from urllib.parse import urljoin\n",
"from bs4 import BeautifulSoup,SoupStrainer\n",
"import requests\n",
"import time\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"BASE_URL = \"http://www.azlyrics.com\"\n",
"artist_url = \"http://www.azlyrics.com/p/pinkfloyd.html\"\n",
"#from selenium import webdriver\n",
"#from selenium.webdriver.common.keys import Keys\n",
"#browser = webdriver.Firefox()\n",
"#browser.get(artist_url)\n",
"#elem=browser.find_element_by_tag_name(\"body\")\n",
"#htmlfile=browser.page_source\n",
"#soupAllSongs=BeautifulSoup(htmlfile,\"lxml\")\n",
"#print(soupAllLinksParse)\n",
"#pagedown=300\n",
"#while pagedown:\n",
" # elem.send_keys(Keys.DOWN)\n",
" # time.sleep(0.1)\n",
" # pagedown=pagedown-1\n",
" #print (pagedown)\n",
"#post_elems=browser.find_element_by_class_name(\"song_title\")\n",
"#I used Selenium to scroll to the bottom (the page had infinite scroll)\n",
"# and saved the result in an HTML file. The commented-out part of the code does the scraping.\n",
"'''\n",
"html=open('pfsongslist.html').read()\n",
"songsList = SoupStrainer('section',{'class': 'all_songs'})\n",
"souplyrics=SoupStrainer('div',{'class':'lyrics'})\n",
"songsLink=[]\n",
"songLyrics=[]\n",
"#songdetails={}\n",
"count=0\n",
"listOfSongs = BeautifulSoup(html,parse_only=songsList)\n",
"for a in listOfSongs.findAll('a',{'class':'song_link' }):\n",
" songsLink.append(a['href'])\n",
" #print (a['title'])\n",
"for i in songsLink:\n",
" response=requests.get(i)\n",
" lyrics=BeautifulSoup(response.text,parse_only=souplyrics)\n",
" for p in lyrics.findAll('p'):\n",
" songLyrics.append(p.text)\n",
" count=count+1\n",
" print(\"At Song %d . Remaining : %d \"%(count,len(songsLink)-count))\n",
" time.sleep(5)\n",
"'''\n",
"\n",
"'''The method above used Genius.com for scraping, but the requests were taking a lot of time\n",
"and I got connection timeouts after several requests, so it wasn't practically possible to use\n",
"it. So I shifted over to a lighter, cleaner lyrics source, azlyrics.com\n",
"Also, I tried using Musixmatch API but felt that scraping is more suitable for this project.\n",
"Though that service is nice and I will consider it in my future projects'''\n",
"\n",
"htmlfile=open('pfaz.html').read()\n",
"soupAllLinks=SoupStrainer('a',{'target': '_blank'})\n",
"\n",
"soupAllLinksParse=BeautifulSoup(htmlfile,parse_only=soupAllLinks)\n",
"\n",
"songLinks=[]\n",
"songLyrics=[]\n",
"count=0\n",
"\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment