Skip to content

Instantly share code, notes, and snippets.

@esehara
Created July 17, 2011 22:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save esehara/1088146 to your computer and use it in GitHub Desktop.
Save esehara/1088146 to your computer and use it in GitHub Desktop.
社会学評論の過去掲載分を一気にダウンロードするスクリプト
# -*- coding:utf-8 -*-
# 社会学評論・過去アーカイヴ Downloader
# http://www.journalarchive.jst.go.jp/japanese/jnltop_ja.php?cdjournal=jsr1950
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import re
class System_html(object):
def __init__(self):
self.opener = urllib2.build_opener()
def get_url(self,url):
html = self.opener.open(url).read()
self.mainsoup = BeautifulSoup(html)
def get_item(self):
self.download_data = []
startpage = 0
table_list = str(self.mainsoup.findAll('table',)[3])
entrysoup = BeautifulSoup(table_list)
table_list = entrysoup.findAll('table')
page_check = re.compile(u'社会学評論')
for page,item in enumerate(table_list):
print page
print page_check.match(unicode(item))
if page_check.match(unicode(item)):
startpage = page
print '[DEBUG]Start Page is ' + str(startpage)
for number,html in enumerate(table_list):
if number > startpage:
subsoup = BeautifulSoup(str(html))
get_data = subsoup.findAll('td',{'class':'black'})
if get_data != None and len(get_data) > 1 and subsoup.find('table',) != None:
subhtml = BeautifulSoup(str(table_list[number + 2]))
if len(subhtml.findAll('a')) > 0 and len(subhtml.findAll('a')) > 1:
self.download_data.append({'title': get_data[0].text,
'author': get_data[1].text,
'url':str(subhtml.findAll('a')[1]['href'])})
re_author = re.compile(':\ ')
re_data = re.compile('\/')
for item in self.download_data:
item['author'] = re_author.sub('_',item['author'])
item['author'] = re_data.sub('_',item['author'])
item['title'] = re_data.sub(u'/',item['title'])
print '[DEBUG]' + item['title'] + item['author']
urllib.urlretrieve('http://www.journalarchive.jst.go.jp/japanese/'+item['url'],'./Download/' + item['title'].encode('utf-8') + '__' + item['author'].encode('utf-8') + '.pdf')
class No_List(object):
def __init__(self):
self.urllist = []
opener = urllib2.build_opener()
html = opener.open('http://www.journalarchive.jst.go.jp/japanese/jnltop_ja.php?cdjournal=jsr1950').read()
mainsoup = BeautifulSoup(html)
subsoup = mainsoup.findAll('table')[3]
for i,j in enumerate(subsoup.findAll('a')):
if i > 17:
self.urllist.append('http://www.journalarchive.jst.go.jp/japanese/' + str(j['href']))
def get_item(self):
get_system = System_html()
for url in self.urllist:
print '[DEBUG]This File is Get ==> ' + url
get_system.get_url(url)
get_system.get_item()
def main():
    """Entry point: build the issue-URL list, then fetch every article."""
    downloader = No_List()
    downloader.get_item()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment