#!/usr/bin/python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2
import re
import os
import requests
import ConfigParser
def country():
    page = 1
    next_flg = True

    # Load the newest article URL saved by the previous run
    inifile = ConfigParser.SafeConfigParser()
    inifile.read('./last_update.ini')
    last_id = inifile.get('country', 'url')
    print 'read back until:', last_id

    while next_flg:
        list_url = 'https://ameblo.jp/countrygirls/themeentrylist-10087903784-' + str(page) + '.html'
        hdr = {'User-Agent': 'Mozilla/5.0'}
        req = urllib2.Request(list_url, headers=hdr)
        html = urllib2.urlopen(req)
        soup = BeautifulSoup(html, "html.parser")

        # Collect the article URLs listed on this page
        h2s = soup.find_all('h2', attrs={'data-uranus-component': 'entryItemTitle'})
        urls = []
        for h2 in h2s:
            urls.append(h2.find('a').get('href'))

        # Remember the newest article URL so the next run knows where to stop
        if page == 1:
            last_url = urls[0]
        # Visit each individual article
        for url in urls:
            # Stop once we hit the newest URL from the previous run
            if url == last_id:
                next_flg = False
                break
            art_req = urllib2.Request(url, headers=hdr)
            art_html = urllib2.urlopen(art_req)
            art_soup = BeautifulSoup(art_html, "html.parser")
            div = art_soup.find('div', class_='skin-entryBody')
            links = div.find_all('a')
            if not links:
                continue
            img_urls = []
            for a in links:
                u = a.get('href')
                if u is None:
                    continue
                if u.split('.')[-1] == 'jpg':
                    img_urls.append(u.replace('https', 'http'))
            if not img_urls:
                continue
            # Build a filename from the article date and a cleaned-up title
            date = art_soup.find('time').text[:10]
            t = art_soup.find('a', attrs={'rel': 'bookmark'}).text
            # Strip "(ももち)" from the title
            t2 = t.replace(u'(ももち)', '')
            # Drop everything except kana, kanji, digits, and the long-vowel mark
            regex = u'[^ぁ-んァ-ン一-龥0-9０-９ー]'
            title = re.sub(regex, '', t2)
            out_dir = 'momochi/'
            filename = date + '_' + title + '_'

            # Download the images (same code as for the Twitter timeline)
            if not os.path.exists(out_dir):
                os.mkdir(out_dir)
            for i, img_url in enumerate(img_urls):
                try:
                    res = requests.get(img_url, allow_redirects=False)
                    name = filename + str(i + 1)
                    ext = img_url.split('.')[-1]
                    print img_url
                    print ext
                    with open(out_dir + name + '.' + ext, 'wb') as f:
                        f.write(res.content)
                    print img_url, 'successfully downloaded.'
                except Exception as e:
                    print '***failed to download:', img_url
                    print e
        # The "next" pagination link carries an extra class while more pages
        # remain; stop when that extra class is gone (or the link is missing)
        next_link = soup.find('a', attrs={'data-uranus-component': 'paginationNext'})
        if next_link is None or len(next_link.get('class')) != 3:
            next_flg = False
            break
        page += 1

    print 'All media downloaded.'

    # Record the newest article URL in the external file for the next run
    inifile.set('country', 'url', last_url)
    with open('./last_update.ini', 'w') as f:
        inifile.write(f)


if __name__ == '__main__':
    country()
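The script expects ./last_update.ini to already exist with a [country] section holding the URL of the newest article downloaded so far; ConfigParser's get() raises NoSectionError otherwise. A minimal one-time seeding sketch, where the entry URL is a placeholder and not a real article:

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Seed last_update.ini so country() has a stopping point on its first run.
import ConfigParser

inifile = ConfigParser.SafeConfigParser()
inifile.add_section('country')
# Placeholder URL -- replace with the newest entry already downloaded.
inifile.set('country', 'url', 'https://ameblo.jp/countrygirls/entry-XXXXXXXXXXXX.html')
with open('./last_update.ini', 'w') as f:
    inifile.write(f)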