Created
July 19, 2016 02:30
-
-
Save peace098beat/0eed58bb0fd0e6b0a2da07031d33d6ab to your computer and use it in GitHub Desktop.
BS4スクレイピング
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!coding:utf-8 | |
""" | |
spotify_bs.py | |
https://spotifycharts.com/regionalから小さいサムネイル画像の取得・保存するスクリプト | |
もしBeautifulSoupがなければインストールしてください
pip install beautifulsoup4
""" | |
import os | |
import urllib | |
import re # 正規表現モジュール | |
import urllib2 | |
from bs4 import BeautifulSoup | |
# Directories where downloaded cover images are saved.
DIR_NAME_XS = "./XS/"  # small thumbnails
DIR_NAME_M = "./M/"    # full-size cover art

# Create the output directories if they do not exist yet.
# Catch OSError specifically (dir already exists) instead of a bare
# `except`, which would also hide real failures such as permission errors.
for _dir in (DIR_NAME_XS, DIR_NAME_M):
    try:
        os.mkdir(_dir)
    except OSError:
        pass
# Fetch the Spotify charts page.
# NOTE(review): reachable from a regular FTTH line; blocked on the office LAN.
page = urllib2.urlopen("https://spotifycharts.com/regional")
html = page.read()
print(html)

# Parse the downloaded HTML into a BeautifulSoup tree.
soup = BeautifulSoup(html, "html.parser")
# *********************************************************
# Download the small thumbnail images
# *********************************************************
# Every chart row keeps its thumbnail inside a ".chart-table-image" cell.
cells = soup.select(".chart-table-image")
total = len(cells)
for idx, cell in enumerate(cells):
    # The <img> child of the cell carries the thumbnail URL.
    thumb = cell.find("img")
    # Keep the name `img_url`: it is referenced again further down the script.
    img_url = thumb['src']
    # Build the destination file name and save the image.
    fname = os.path.join(DIR_NAME_XS, "XS-" + os.path.basename(img_url) + ".jpg")
    urllib.urlretrieve(img_url, fname)
    # Progress display.
    print(fname)
    done = idx + 1
    bar = "*" * int(float(done) / total * 100)
    print("|%-100s-| %d/%d" % (bar, done, total))
print("** fin mini image download **")
# *********************************************************
# Download the full-size cover images
# *********************************************************
# Each ".chart-table-image" cell links to the track's detail page, whose
# ".cover-art-image" div carries the large cover URL in its inline CSS style.
target = soup.select(".chart-table-image")
for i, element in enumerate(target):
    # The <a> wrapping the thumbnail points at the track's detail page.
    jump_url = element.find("a")['href']
    print(jump_url)

    # Fetch and parse the track page.  Use a separate soup variable so the
    # chart page's parse tree (`soup`) is not clobbered inside the loop.
    track_html = urllib2.urlopen(jump_url).read()
    track_soup = BeautifulSoup(track_html, "html.parser")

    # The cover URL is embedded in the div's style attribute, e.g.
    #   background-image: url(//i.scdn.co/image/b48b1b7e5...)
    # BeautifulSoup offers no style-attribute parser, so extract it by regex.
    style = track_soup.select(".cover-art-image")[0]['style']
    urls = re.findall(r'url\((.*?)\)', style)  # ["//i.scdn.co/image/..."]
    url = "http:" + urls[0]                    # scheme-relative -> absolute

    # BUG FIX: the original saved `img_url` — the small thumbnail left over
    # from the previous loop — instead of the large image `url`, and wrote
    # the file to the current directory instead of DIR_NAME_M.
    fname = os.path.join(DIR_NAME_M, "M-" + os.path.basename(url) + ".jpg")
    urllib.urlretrieve(url, fname)

    # Progress display.
    print(url)
    print(fname)
    progress = "*" * int(float(i + 1) / len(target) * 100)
    print("|%-100s-| %d/%d" % (progress, i + 1, len(target)))
print("** fin image download **")
print("** end **")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment