Created
July 17, 2011 22:12
-
-
Save esehara/1088146 to your computer and use it in GitHub Desktop.
社会学評論の過去掲載分を一気にダウンロードするスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
# 社会学評論・過去アーカイヴ Downloader | |
# http://www.journalarchive.jst.go.jp/japanese/jnltop_ja.php?cdjournal=jsr1950 | |
import urllib | |
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
import re | |
class System_html(object): | |
def __init__(self): | |
self.opener = urllib2.build_opener() | |
def get_url(self,url): | |
html = self.opener.open(url).read() | |
self.mainsoup = BeautifulSoup(html) | |
def get_item(self): | |
self.download_data = [] | |
startpage = 0 | |
table_list = str(self.mainsoup.findAll('table',)[3]) | |
entrysoup = BeautifulSoup(table_list) | |
table_list = entrysoup.findAll('table') | |
page_check = re.compile(u'社会学評論') | |
for page,item in enumerate(table_list): | |
print page | |
print page_check.match(unicode(item)) | |
if page_check.match(unicode(item)): | |
startpage = page | |
print '[DEBUG]Start Page is ' + str(startpage) | |
for number,html in enumerate(table_list): | |
if number > startpage: | |
subsoup = BeautifulSoup(str(html)) | |
get_data = subsoup.findAll('td',{'class':'black'}) | |
if get_data != None and len(get_data) > 1 and subsoup.find('table',) != None: | |
subhtml = BeautifulSoup(str(table_list[number + 2])) | |
if len(subhtml.findAll('a')) > 0 and len(subhtml.findAll('a')) > 1: | |
self.download_data.append({'title': get_data[0].text, | |
'author': get_data[1].text, | |
'url':str(subhtml.findAll('a')[1]['href'])}) | |
re_author = re.compile(':\ ') | |
re_data = re.compile('\/') | |
for item in self.download_data: | |
item['author'] = re_author.sub('_',item['author']) | |
item['author'] = re_data.sub('_',item['author']) | |
item['title'] = re_data.sub(u'/',item['title']) | |
print '[DEBUG]' + item['title'] + item['author'] | |
urllib.urlretrieve('http://www.journalarchive.jst.go.jp/japanese/'+item['url'],'./Download/' + item['title'].encode('utf-8') + '__' + item['author'].encode('utf-8') + '.pdf') | |
class No_List(object): | |
def __init__(self): | |
self.urllist = [] | |
opener = urllib2.build_opener() | |
html = opener.open('http://www.journalarchive.jst.go.jp/japanese/jnltop_ja.php?cdjournal=jsr1950').read() | |
mainsoup = BeautifulSoup(html) | |
subsoup = mainsoup.findAll('table')[3] | |
for i,j in enumerate(subsoup.findAll('a')): | |
if i > 17: | |
self.urllist.append('http://www.journalarchive.jst.go.jp/japanese/' + str(j['href'])) | |
def get_item(self): | |
get_system = System_html() | |
for url in self.urllist: | |
print '[DEBUG]This File is Get ==> ' + url | |
get_system.get_url(url) | |
get_system.get_item() | |
def main():
    """Entry point: build the issue list, then download everything."""
    downloader = No_List()
    downloader.get_item()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment