#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import re
import os
import requests
import ConfigParser
def country():
    page = 1
    next_flg = True

    # Load the newest article URL saved by the previous run
    inifile = ConfigParser.SafeConfigParser()
    inifile.read('./last_update.ini')
    last_id = inifile.get('country', 'url')
    print 'read back until:', last_id

    while next_flg:
        list_url = 'https://ameblo.jp/countrygirls/themeentrylist-10087903784-' + str(page) + '.html'
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = urllib2.Request(list_url, headers=hdr)
        html = urllib2.urlopen(req)
        soup = BeautifulSoup(html, "html.parser")

        # Collect the article URLs listed on this page
        h2s = soup.find_all('h2', attrs={'data-uranus-component': 'entryItemTitle'})
        urls = []
        for h2 in h2s:
            urls.append(h2.find('a').get('href'))

        # Remember the newest article URL so the next run knows where to stop
        if page == 1:
            last_url = urls[0]
        # Visit each individual article
        for url in urls:
            # Stop once we hit the newest URL from the previous run
            if url == last_id:
                next_flg = False
                break
            art_req = urllib2.Request(url, headers=hdr)
            art_html = urllib2.urlopen(art_req)
            art_soup = BeautifulSoup(art_html, "html.parser")
            div = art_soup.find('div', class_='skin-entryBody')
            links = div.find_all('a')
            if not links:
                continue
            img_urls = []
            for a in links:
                u = a.get('href')
                if u is None:
                    continue
                if u.split('.')[-1] == 'jpg':
                    img_urls.append(u.replace('https', 'http'))
            if not img_urls:
                continue
            # Build a filename from the article date and a cleaned-up title
            date = art_soup.find('time').text[:10]
            t = art_soup.find('a', attrs={'rel': 'bookmark'}).text
            # Strip "(ももち)" from the title
            t2 = t.replace(u'(ももち)', '')
            # Drop everything except kana, kanji, digits, and the long-vowel mark
            regex = u'[^ぁ-んァ-ン一-龥0-9０-９ー]'
            title = re.sub(regex, '', t2)
            out_dir = 'momochi/'
            filename = date + '_' + title + '_'

            # Download the images (same code as for the Twitter timeline)
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            for i, img_url in enumerate(img_urls):
                try:
                    res = requests.get(img_url, allow_redirects=False)
                    name = filename + str(i + 1)
                    ext = img_url.split('.')[-1]
                    print img_url
                    print ext
                    with open(out_dir + name + '.' + ext, 'wb') as f:
                        f.write(res.content)
                    print img_url, 'successfully downloaded.'
                except Exception as e:
                    print '***failed to download:', img_url
                    print e
        # The "next" pagination link carries an extra class while more pages
        # remain; stop when that extra class is gone (or the link is missing)
        next_link = soup.find('a', attrs={'data-uranus-component': 'paginationNext'})
        if next_link is None or len(next_link.get('class')) != 3:
            next_flg = False
            break
        page += 1

    print 'All media downloaded.'

    # Record the newest article URL in the external file for the next run
    inifile.set('country', 'url', last_url)
    with open('./last_update.ini', 'w') as f:
        inifile.write(f)


if __name__ == '__main__':
    country()
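The script expects ./last_update.ini to already exist with a [country] section holding the URL of the newest article downloaded so far; ConfigParser's get() raises NoSectionError otherwise. A minimal one-time seeding sketch, where the entry URL is a placeholder and not a real article:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Seed last_update.ini so country() has a stopping point on its first run.
import ConfigParser

inifile = ConfigParser.SafeConfigParser()
inifile.add_section('country')
# Placeholder URL -- replace with the newest entry already downloaded.
inifile.set('country', 'url', 'https://ameblo.jp/countrygirls/entry-XXXXXXXXXXXX.html')
with open('./last_update.ini', 'w') as f:
    inifile.write(f)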