Created
June 27, 2015 12:36
-
-
Save mr-karan/c5a8ac3847c893e3f6e1 to your computer and use it in GitHub Desktop.
IPython Scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from urllib.parse import urljoin\n",
    "from bs4 import BeautifulSoup, SoupStrainer\n",
    "import requests\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "BASE_URL = \"http://www.azlyrics.com\"\n",
    "artist_url = \"http://www.azlyrics.com/p/pinkfloyd.html\"\n",
    "\n",
    "# --- Selenium approach (kept for reference): scroll an infinite-scroll page ---\n",
    "#from selenium import webdriver\n",
    "#from selenium.webdriver.common.keys import Keys\n",
    "#browser = webdriver.Firefox()\n",
    "#browser.get(artist_url)\n",
    "#elem=browser.find_element_by_tag_name(\"body\")\n",
    "#htmlfile=browser.page_source\n",
    "#soupAllSongs=BeautifulSoup(htmlfile,\"lxml\")\n",
    "#print(soupAllLinksParse)\n",
    "#pagedown=300\n",
    "#while pagedown:\n",
    "#    elem.send_keys(Keys.DOWN)\n",
    "#    time.sleep(0.1)\n",
    "#    pagedown=pagedown-1\n",
    "#    print(pagedown)\n",
    "#post_elems=browser.find_element_by_class_name(\"song_title\")\n",
    "# I used Selenium to scroll till bottom (had infinite scroll)\n",
    "# and saved the result in a html file. Commented part in the code is of scraping.\n",
    "'''\n",
    "html=open('pfsongslist.html').read()\n",
    "songsList = SoupStrainer('section',{'class': 'all_songs'})\n",
    "souplyrics=SoupStrainer('div',{'class':'lyrics'})\n",
    "songsLink=[]\n",
    "songLyrics=[]\n",
    "#songdetails={}\n",
    "count=0\n",
    "listOfSongs = BeautifulSoup(html,parse_only=songsList)\n",
    "for a in listOfSongs.findAll('a',{'class':'song_link' }):\n",
    "    songsLink.append(a['href'])\n",
    "    #print (a['title'])\n",
    "for i in songsLink:\n",
    "    response=requests.get(i)\n",
    "    lyrics=BeautifulSoup(response.text,parse_only=souplyrics)\n",
    "    for p in lyrics.findAll('p'):\n",
    "        songLyrics.append(p.text)\n",
    "    count=count+1\n",
    "    print(\"At Song %d . Remaining : %d \"%(count,len(songsLink)-count))\n",
    "    time.sleep(5)\n",
    "'''\n",
    "\n",
    "'''Above method was using Genius.com for scraping but the requests were taking a lot of time\n",
    "and I got connection timeout after several requests, so it wasn't practically possible to use\n",
    "it. So I shifted over to a lightweight more cleaner lyrics source, azlyrics.com\n",
    "Also, I tried using Musixmatch API but felt that scraping is more suitable for this project.\n",
    "Though that service is nice and I will consider it in my future projects'''\n",
    "\n",
    "# Read the pre-saved AZLyrics artist page; 'with' guarantees the file handle is closed\n",
    "with open('pfaz.html') as f:\n",
    "    htmlfile = f.read()\n",
    "\n",
    "# Restrict parsing to anchors that open in a new tab (AZLyrics song links)\n",
    "soupAllLinks = SoupStrainer('a', {'target': '_blank'})\n",
    "\n",
    "# Explicit parser (same one the Selenium draft above used) avoids bs4's\n",
    "# \"no parser specified\" warning and parser-guessing nondeterminism\n",
    "soupAllLinksParse = BeautifulSoup(htmlfile, 'lxml', parse_only=soupAllLinks)\n",
    "\n",
    "songLinks = []\n",
    "songLyrics = []\n",
    "count = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment