@pl0923
Created April 3, 2018 09:25
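
A Python 2 scraper for the Country Girls Ameblo blog: it walks the theme entry list page by page, downloads the .jpg images linked from each article newer than the previous run, and records the newest article URL in last_update.ini so the next run knows where to stop.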
#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import re
import os
import ConfigParser
import requests
def country():
    page = 1
    next_flg = True
    # Read the newest article URL saved by the previous run
    inifile = ConfigParser.SafeConfigParser()
    inifile.read('./last_update.ini')
    last_id = inifile.get('country', 'url')
    print 'read back until:', last_id
    while next_flg:
        list_url = 'https://ameblo.jp/countrygirls/themeentrylist-10087903784-' + str(page) + '.html'
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = urllib2.Request(list_url, headers=hdr)
        html = urllib2.urlopen(req)
        soup = BeautifulSoup(html, "html.parser")
        # Collect the URL of every article listed on this page
        h2s = soup.find_all('h2', attrs={'data-uranus-component': 'entryItemTitle'})
        urls = []
        for h2 in h2s:
            url = h2.find('a').get('href')
            urls.append(url)
        # Keep the newest article URL to write back for the next run
        if page == 1:
            last_url = urls[0]
        # Visit each individual article
        for url in urls:
            # Stop once we reach the newest URL from the previous run
            if url == last_id:
                next_flg = False
                break
            art_req = urllib2.Request(url, headers=hdr)
            art_html = urllib2.urlopen(art_req)
            art_soup = BeautifulSoup(art_html, "html.parser")
            div = art_soup.find('div', class_='skin-entryBody')
            links = div.find_all('a')
            if not links:
                continue
            # Collect the links that point at .jpg images
            img_urls = []
            for a in links:
                u = a.get('href')
                if u is None:
                    continue
                if u.split('.')[-1] == 'jpg':
                    u = u.replace('https', 'http')
                    img_urls.append(u)
            if not img_urls:
                continue
            # Build a filename prefix from the post date and a cleaned title
            date = art_soup.find('time').text[:10]
            t = art_soup.find('a', attrs={'rel': 'bookmark'}).text
            # Remove "(ももち)" (Momochi) from the title
            t2 = t.replace(u'(ももち)', '')
            # Strip everything except kana, kanji, digits, and the long-vowel mark
            regex = u'[^ぁ-んァ-ン一-龥0-90-9ー]'
            title = re.sub(regex, '', t2)
            save_dir = 'momochi/'
            filename = date + '_' + title + '_'
            # Download the images (same code as the Twitter TL script)
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            for i, img_url in enumerate(img_urls):
                try:
                    res = requests.get(img_url, allow_redirects=False)
                    image = res.content
                    name = filename + str(i + 1)
                    ext = img_url.split('.')[-1]
                    print img_url
                    print ext
                    f = open(save_dir + name + '.' + ext, 'wb')
                    f.write(image)
                    f.close()
                    print img_url, 'successfully downloaded.'
                except Exception as e:
                    print '***failed to download:', img_url
                    print e
        # Heuristic from the page markup: the "next page" anchor appears to
        # carry exactly three CSS classes only while another page exists
        page_class = len(soup.find('a', attrs={'data-uranus-component': 'paginationNext'}).get('class'))
        if page_class != 3:
            next_flg = False
            break
        page += 1
    print 'All media downloaded.'
    # Write the newest article URL back so the next run stops there
    inifile.set('country', 'url', last_url)
    f = open('./last_update.ini', 'w')
    inifile.write(f)
    f.close()


if __name__ == '__main__':
    country()
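
The script expects ./last_update.ini to already exist with a [country] section holding the newest article URL seen so far (ConfigParser.get raises NoSectionError otherwise). A minimal seed file might look like the following, where the url value is only a placeholder entry URL, not a real post:

[country]
url = https://ameblo.jp/countrygirls/entry-0000000000000.html

On a first run with a placeholder, no article matches last_id, so the script downloads every page until the pagination check fails, then overwrites the placeholder with the real newest URL.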