# coding: utf-8
# # MonkeyBench : an interactive environment for library research
#
#
# MonkeyBench is an interactive environment for library research, under construction on top of Jupyter.
#
# Because the whole research process and its results can be saved, shared, and reused as-is, together with the tools built and used along the way, it aims to be an environment that supports reviewing the research process, improving its quality, and turning it into reusable know-how.
#
# This is not quite Neurath's boat, but because the concept is to "keep rebuilding while at sea" it will never be finished; since it is a collection of tools, however, each part can be used on its own.
# # Command list and workflow
#
# ## Preliminary research
#
# | Magic command | Function |
# |:-----------|:------------|
# | %dictj (keyword)  |  Searches local dictionaries (EPWING) and Japanese online encyclopedias for the keyword all at once, and lists the entries from the shortest description to the longest |
# | %refcase (keyword) | Searches the Collaborative Reference Database, collects reference cases, and displays them |
# | %thememap (keyword) | Builds a hierarchy chart of the fields the keyword belongs to, based on Wikipedia category information |
# | %suggestmap (keyword) | Uses Google Suggest to collect terms that co-occur with the keyword in searches and turns them into a map |
# | %kotobankmap (keyword) | Builds a relation chart from kotobank related keywords |
# | %webliomap (keyword) | Builds a relation chart from weblio related keywords |
#
# ## Literature search
#
# | Magic command | Function |
# |:-----------|:------------|
# |%webcat (keyword)|  Searches Webcat Plus Minus for books etc. and puts the results in a DataFrame|
# | %ndl (keyword)| Searches NDL Search (国立国会図書館サーチ) and puts the results in a DataFrame<br>Covers not only books and journal articles but also digital materials, reference cases, etc.; limited to 500 records|
# |%amazonsimi (keyword)| Follows Amazon's similar-item information to collect related books|
# |%amazonmap (keyword)| Draws the related books collected from Amazon's similar-item information as a relation chart|
# |%amazonPmap (keyword)| Draws the related books collected from Amazon's similar-item information as a relation chart using product images|
#
#
# ## Literature search helpers
#
# | Magic command | Function |
# |:-----------|:------------|
# | %extbook (DataFrame)|  Extracts the book titles contained in a DataFrame into a new DataFrame<br> Used to pull titles out of encyclopedia search results or reference cases|
# | %%text2bib <br>(multi-line bibliographic data)| Picks bibliographic information out of plain text and turns it into a DataFrame<br> Used to enter references by copy-and-paste or by hand|
# | %makebib (DataFrame)(DataFrame)……|  Merges the DataFrames produced by the literature-search commands, removes duplicates, fetches detailed records, and returns a DataFrame|
# |%amazonreviews (bibliography DataFrame)|Expands the table-of-contents information contained in the DataFrame|
# |%amazonreviews (bibliography DataFrame)|Collects Amazon reviews for the books contained in the DataFrame|
#
# ## Availability search
#
# | Magic command | Function |
# |:-----------|:------------|
# | %librarylist (place name, station name, ...)|  Given place or station names (multiple, comma-separated), returns a DataFrame of matching / nearby libraries<br>Used to choose the target libraries for the following %librarystatue command|
# | %librarystatue (bibliography DataFrame)(library-list DataFrame)|  Given a bibliography DataFrame and a library-list DataFrame, returns a DataFrame of each library's holdings / lending status / reservation URL for every book|
# | %stock_amazon (bibliography DataFrame)|  Given a bibliography DataFrame, returns a DataFrame of Amazon stock and the lowest new / used price for every book|
# | %stock_kosho (bibliography DataFrame)|  Given a bibliography DataFrame, returns a DataFrame of stock, sellers, prices, etc. on 「日本の古本屋」 for every book|
# | %stock_ndl (bibliography DataFrame)|  Given a bibliography DataFrame, returns a DataFrame of holdings / lending status / reservation URLs at the National Diet Library and each prefectural library for every book|
#
#
#
# <img src="http://blog-imgs-90.fc2.com/r/e/a/readingmonkey/MonkeyBench.png">
# # Code
# In[1]:
#standard library modules
from collections import OrderedDict
import io
import random
import re
import subprocess
import time
import urllib
from urllib.error import HTTPError
#IPython modules
from IPython.core.magic import (Magics, magics_class, line_magic, cell_magic, line_cell_magic)
from IPython.core.magic import (register_line_magic, register_cell_magic)
from IPython.display import display
from IPython.display import HTML
#third-party modules
from bottlenose import api
from bs4 import BeautifulSoup
import lxml.html
import pandas as pd
#import pygraphviz as pgv
from graphviz import Digraph
from graphviz import Source
import requests
import wikipedia
# In[2]:
#API keys
AMAZON_ACCESS_KEY_ID="XXXXXXXXXXXXXXXXXXXX"
AMAZON_SECRET_KEY="xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx+x+xxxxx"
AMAZON_ASSOC_TAG="xxxxxxxxxxxxx-99"
calil_app_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
# In[3]:
#My regular libraries
my_lib_string='''
systemid,systemname,formal,address,tel,geocode,category,url_pc
Osaka_Osaka,大阪府大阪市,大阪市立中央図書館,大阪府大阪市西区北堀江4-3-2,06-6539-3300,"135.486381,34.673879",MEDIUM,http://www.oml.city.osaka.lg.jp/
'''
sio = io.StringIO(my_lib_string)
MYLIBRARY = pd.read_csv(sio)
# ## General-purpose tools
# In[4]:
#Helpers for displaying results
def pp(df, now_index, now_column='description'):
if now_column == 'url':
return HTML(df[now_column][now_index])
else:
return HTML(df[now_column][now_index].replace('\n','<br>'))
def url(df, now_index):
if 'url' in df.columns:
return HTML(df['url'][now_index])
# def p(df):
# for now_column in df.columns:
# print now_column + '\t',
# print
# print ' --- '
# datanum, fieldnum = df.shape
# for i in range(0, datanum):
# for now_column in df.columns:
# print str(df[now_column].iat[i]).strip().encode('utf-8')+ '\t',
# print
# print ' --- '
#Render a DataFrame as embedded HTML
def embed_df(df):
datanum, fieldnum = df.shape
now_html = '<table><th></th>'
for now_column in df.columns:
now_html += '<th>' +str(now_column)+'</th>'
for i in range(0, datanum):
now_html +='<tr><th>{}</th>'.format(i+1)
for now_column in df.columns:
now_item = df[now_column].iat[i]
#print now_item, type(now_item)
if 'url' in now_column and 'http' in str(now_item):
now_html += '<td><a href="{0}" target="_blank">LINK</a></td>'.format(now_item)
else:
now_html += '<td>' +str(now_item) +'</td>'
now_html +='</tr>'
now_html +='</table>'
return HTML(now_html)
@register_cell_magic
def csv2df(line, cell):
# We create a string buffer containing the
# contents of the cell.
sio = io.StringIO(cell)
# We use pandas' read_csv function to parse
# the CSV string.
return pd.read_csv(sio)
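# A minimal usage sketch for the %%csv2df cell magic defined above (the column names are
# made up): the cell body is parsed as CSV and returned as a DataFrame.
#
# %%csv2df
# title,author
# 論語,孔子
#
# The registered function can also be called directly, for example:
sample_csv_df = csv2df('', 'title,author\n論語,孔子')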
# In[5]:
#Extract 『book titles』 from text and return them as a list
def extract_titles_from_text(text, recom):
r = re.compile(recom)
text = str(text)
books= r.findall(text)
if books:
return books
else:
return None
def extract_book_titles_from_text(text):
r = re.compile('『([^』]+)』')
text = str(text)
books= r.findall(text)
if books:
return books
else:
return None
def extract_engbook_titles_from_text(text):
r = re.compile(r'<em>([A-Z][^<]+)</em>')
text = str(text)
books= r.findall(text)
if books:
return books
else:
return None
def extract_eng_article_titles_from_text(text):
r = re.compile('“([A-Z][^”]+),”')
text = str(text)
books= r.findall(text)
if books:
return books
else:
return None
# Pull 『book titles』 out of a DataFrame
def extract_titles_from_df(df, recom):
result_list = []
datanum, fieldnum = df.shape
for i in range( datanum):
for j in range(fieldnum):
title_list = extract_titles_from_text(df.iloc[i,j],recom)
if title_list:
for title in title_list:
if title not in result_list:
result_list.append(title)
return result_list
def ext_book(df):
result_df = pd.DataFrame(columns =['title'])
extract_method = {'本':'『([^』]+)』','Book':r'<em>([A-Z][^<]+)</em>', 'Article':'“([A-Z][^”]+),”'}
for category, recom in extract_method.items():
for now_title in extract_titles_from_df(df, recom):
result_dict = OrderedDict()
result_dict['category'] = category,
result_dict['title'] = now_title,
result_df = result_df.append(pd.DataFrame(result_dict))
result_df.index = range(1, len(result_df)+1) #最後にインデックスつけ
return result_df
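# A small sketch of what %extbook does under the hood: ext_book() scans every cell of a
# DataFrame and extracts titles written as 『書名』, <em>English Book</em> or “English Article,”.
# The sample text below is made up for illustration.
sample_text_df = pd.DataFrame({'description': ['書評では『夜と霧』と『苦海浄土』が取り上げられていた。']})
ext_book(sample_text_df)   # -> DataFrame with category '本' and the two extracted titles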
# In[6]:
def ext_amazon(url):
'''
    Extracts links to amazon.co.jp from the page at url and returns a DataFrame of title and asin
'''
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding),'lxml')
amazon_items = soup.findAll('a', {'href':re.compile(r'http://www.amazon.co.jp.+')})
result_dict = OrderedDict()
result_dict['title'] = [a.text for a in amazon_items]
result_dict['asin'] = [re.search(r'[0-9X]{10,10}',a.attrs['href']).group() for a in amazon_items]
return pd.DataFrame(result_dict)
# ## Amazon
# In[7]:
amazon = api.Amazon(AMAZON_ACCESS_KEY_ID, AMAZON_SECRET_KEY, AMAZON_ASSOC_TAG, Region="JP")
def error_handler(err):
ex = err['exception']
if isinstance(ex, HTTPError) and ex.code == 503:
time.sleep(random.expovariate(0.1))
return True
def get_totalpages(keywords, search_index="All"):
response = amazon.ItemSearch(
SearchIndex=search_index,
Keywords=keywords,
ResponseGroup="ItemIds",
ErrorHandler=error_handler)
soup=BeautifulSoup(response, "lxml")
# print soup
totalpages=int(soup.find('totalpages').text)
return totalpages
def item_search(keywords, search_index="All", item_page=1):
try:
response = amazon.ItemSearch(
SearchIndex=search_index,
Keywords=keywords,
ItemPage=item_page,
ResponseGroup="Large",
ErrorHandler=error_handler)
# バイト配列をunicodeの文字列に
# コレをしないと日本語は文字化けします。
#u_response = response.decode('utf-8','strict')
soup = BeautifulSoup(response, "lxml")
return soup.findAll('item')
except:
return None
def item_title_book_search(title, search_index="Books", item_page=1):
try:
response = amazon.ItemSearch(
SearchIndex=search_index,
Title=title,
ItemPage=item_page,
ResponseGroup="Large",
ErrorHandler=error_handler)
# バイト配列をunicodeの文字列に
# コレをしないと日本語は文字化けします。
#u_response = response.decode('utf-8','strict')
soup = BeautifulSoup(response, "lxml")
return soup.findAll('item')
except:
return None
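# A hedged usage sketch for the Product Advertising API helpers above. It needs valid
# AMAZON_* keys; with the placeholder keys item_search() simply returns None, which the
# guard below tolerates. The keyword is arbitrary.
items = item_search('深層学習', search_index='Books')
if items:
    for item in items[:3]:
        print(item.find('asin').text, item.find('title').text)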
# ## Literature search
# In[8]:
class Searcher(object):
"""
    Abstract class: queries a set of resources, merges the results, sorts them, and returns them.
    Input  - keyword: self.keyword
    Output - DataFrame (item title, resource name, content, description length, etc.)
    Class attribute: resource_dict = {name: path} - the list of resources, i.e. resource names and access information (URLs, dictionary paths, etc.)
"""
    #class attribute
    #list of resources = resource names and access information (URLs, dictionary paths, etc.)
resource_dict={}
def __init__(self, keyword):
self.keyword = keyword
def collector(self):
result_df = pd.DataFrame () #入れ物を用意する
resource_list = self.make_resource_list()
for resource in resource_list: #リソースリストから一つずつ取り出すループ
print ('.', end='')
result_df = result_df.append( self.fetch_data(resource) ) #キーワードについてそのリソースからデータを得て、入れ物に追加
result_df = self.arrange(result_df) #ループ終わったら入れ物の中身をソート
return result_df #入れ物の中身を返す
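# A minimal concrete subclass, only to illustrate the interface that Searcher.collector()
# expects: make_resource_list() yields resources, fetch_data() returns a DataFrame per
# resource, and arrange() sorts and re-indexes the merged result. EchoSearcher is a
# throwaway example and not part of MonkeyBench itself.
class EchoSearcher(Searcher):
    def make_resource_list(self):
        # one "resource" per character of the keyword, purely for illustration
        return list(self.keyword)
    def fetch_data(self, resource):
        return pd.DataFrame({'item_title': [resource], 'des_size': [len(resource)]})
    def arrange(self, df):
        df = df.sort_values(by='des_size')
        df.index = range(1, len(df) + 1)   # re-index from 1, as the real searchers do
        return df

# EchoSearcher('図書館').collector()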
# ## Dictionary lookup (EPWING)
# In[9]:
class epwing_info(Searcher):
dict_root = '/Users/kuru/Documents/EPWING/'
resource_dict={
'ブリタニカ小項目版' : ['ブリタニカ国際大百科事典.Windows対応小項目版.2000','-t+'],
'マグローヒル科学技術用語大辞典' : ['マグローヒル科学技術用語大辞典.第3版','-t+'],
'医学大辞典' : ['医歯薬\ 医学大辞典','-t+'],
'岩波日本史辞典' : ['岩波\ 日本史辞典-ebzip','-t+'],
'理化学辞典第五版ebzip' : ['岩波\ 理化学辞典第五版ebzip','-t+'],
'広辞苑' : ['岩波.広辞苑.第六版','-t+'],
'岩波ケンブリッジ世界人名辞典' : ['岩波\=ケンブリッジ世界人名辞典ebzip','-t+'],
'岩波生物学辞典' : ['岩波生物学辞典第四版','-t+'],
'建築事典' : ['建築事典ebzip','-t+'],
'リーダーズ' : ['研究社\ リーダーズプラス\ V2ebzip','-t+'],
'大辞林' : ['三省堂大辞林','-t+'],
'参考図書2.4万冊' : ['参考図書2.4万冊','-t+'],
'参考調査便覧' : ['参考調査便覧epwing','-tmax'],
'国語大辞典' : ['小学館\ 国語大辞典ebzip','-t+'],
'日本大百科全書' : ['小学館\ 日本大百科全書ebzip','-t+'],
'ランダムハウス英語辞典' : ['小学館.ランダムハウス英語辞典','-t+'],
'大辞泉' : ['小学館.大辞泉','-t+'],
'心理学用語辞典' : ['心理学用語辞典','-t+'],
'人物レファレンス事典' : ['人物レファレンス事典\ 日本編-ebzip','-t+'],
'南山堂医学大辞典' : ['南山堂医学大辞典第18版','-t+'],
'科学技術45万語対訳辞典' : ['日外アソシエーツ\ 科学技術45万語対訳辞典\ 英和/和英','-t+'],
'物語要素事典' : ['物語要素事典epwing','-t+'],
'世界大百科事典' : ['平凡社\ 世界大百科事典\ 第二版','-t+'],
'有斐閣経済辞典' : ['有斐閣-経済辞典-第3版-ebzip','-t+']}
def make_resource_list(self):
return self.resource_dict.keys()
def make_query(self, resource_name):
dict_info = self.resource_dict[resource_name]
path = self.dict_root + dict_info[0]
search_mode = dict_info[1]
# print 'ebmini -b{dict_path} -KIu -KOu -n1 {mode} -l- -s{keyword}'.format(dict_path=path, mode=search_mode, keyword=self.keyword)
return 'ebmini -b{dict_path} -KIu -KOu -n1 {mode} -l- -s{keyword}'.format(dict_path=path, mode=search_mode, keyword=self.keyword)
def fetch_data(self, resource_name):
result = subprocess.getstatusoutput(self.make_query(resource_name))[1]
#print (result)
result_dict = OrderedDict()
result_dict['item_title'] = result.split('\n')[0],
result_dict['dict_name'] = resource_name,
result_dict['description'] = result,
result_dict['des_size'] = len(result),
return pd.DataFrame(result_dict)
def arrange(self, df):
if not df.empty:
df = df[df.des_size >0] #長さゼロは抜く
df = df[df.item_title !='Error: eb_bind() failed.'] #失敗は抜く
df = df.sort_values(by='des_size') #並べ替え
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
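# Usage sketch: epwing_info shells out to ebmini for each EPWING dictionary under
# dict_root, so it only runs on a machine where those dictionary paths and the ebmini
# command actually exist.
# e = epwing_info('エントロピー')
# dict_df = e.collector()   # one row per dictionary, sorted by description length
# pp(dict_df, 1)            # display the shortest entry as HTML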
# In[10]:
#Build a bibliography DataFrame from the 参考調査便覧 (reference survey handbook)
class Refhand_info(epwing_info):
dict_root = '/Users/kuru/Documents/EPWING/'
resource_dict={
'参考調査便覧' : ['参考調査便覧epwing','-mw,c -tmax'],
}
@register_line_magic
def refhand(line):
e = Refhand_info(line)
df = e.collector()
if len(df) >0:
return parse_refhand(df['description'].iat[0])
else:
print ('not found')
def get_bookdata(i, items):
#print len(items), i
if len(items)>=abs(i) and items[i]:
return items[i]
else:
return ''
category_data = '''【1】,解題・研究案内・研究史
【2】,書誌・著作目録・物品標本目録
【3】,巻末・章末参考文献・系図
【4】,年譜・年表
【5】,年鑑・白書・統計書
【6】,人名・機関名・人名辞典
【7】,辞典・事典・索引
【8】,総覧・便覧・六法
【9】,雑誌目録・総目次・総索引
【10】,AV資料・写真・図版・地図'''
category_dict = {line.split(',')[0] : line.split(',')[1] for line in category_data.split('\n')}
def parse_refhand(strings):
df = pd.DataFrame()
#print strings
for line in strings.split('\n')[1:]:
iscategory = re.search('^【[0-9・]+】',line)
if iscategory:
now_category = category_dict.get(iscategory.group())
if not now_category:
#【4・5】のような複合もの
now_category = ''
for i in re.findall(r'[0-9]',iscategory.group()):
now_category += category_dict.get('【{}】'.format(i)) + '/'
#print books
books = re.sub('^【[0-9・]+】', '', line)
for book in books.split('◆'):
#print 'book',book
items = re.compile('[ \s]+').split(book.strip())
result = OrderedDict()
#result['size'] = len (items),
if len(items) < 3:
result['title'] = get_bookdata(0, items),
result['author'] = '',
else:
result['title'] = ' '.join([get_bookdata(i, items) for i in range(len(items)-2)]),
result['author'] = get_bookdata(-2, items),
publisher_issued= get_bookdata(-1, items)
result['publisher'] = re.sub(r'[0-9\-]{4,7}', '', publisher_issued)
m = re.search(r'[0-9\-]{4,7}',publisher_issued)
if m:
result['issued'] = m.group()
else:
result['issued'] = ''
result['type'] = now_category,
df = df.append(pd.DataFrame(result))
df.index = range(1, len(df)+1)
return df
def parse_refhand_test(strings):
df = pd.DataFrame()
#print strings
for line in strings.split('\n')[1:]:
iscategory = re.search('^【[0-9・]+】',line)
if iscategory:
#print books
books = re.sub('^【[0-9・]+】', '', line)
for book in books.split('◆'):
#print 'book',book
result = OrderedDict()
result['book'] = book,
items = re.compile('[ \s]+').split(book)
result['size'] = len (items),
for i in range(len (items)-1, -len (items),-1):
#print i,
if items[i]:
result[str(i)] = items[i],
else:
result[str(i)] ='',
df = df.append(pd.DataFrame(result))
return df
# ## Kotobank
# In[11]:
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
class Kotobank_info(Searcher):
def make_resource_list(self):
'''
        Re-resolves the keyword through the Yahoo dictionary search, builds a kotobank URL for each resolved keyword, and returns them as a list
'''
url = 'http://dic.search.yahoo.co.jp/search?p=' + self.keyword + '&stype=exact&aq=-1&oq=&ei=UTF-8'
content = requests.get(url, headers=headers).text
soup = BeautifulSoup(content, "lxml")
#<h3><a href="・・・・">駄洒落</a></h3>
url_list = []
h3_soups = soup.findAll('h3')
if h3_soups:
for h3_soup in h3_soups:
url = 'https://kotobank.jp/word/' + urllib.parse.quote(h3_soup.text)
if url not in url_list:
url_list.append(url)
return url_list
def fetch_data(self, url):
'''
        Opens and parses the URL and picks out the entries
'''
content = requests.get(url, headers=headers).text
soup = BeautifulSoup(content, "lxml")
article_soups = soup.findAll('article')
result_df = pd.DataFrame()
if article_soups:
#ダイレクトで見つかった場合
for article_soup in article_soups:
# <h2>◯◯辞典<span>の解説</span></h2>を見つけて
dict_name = article_soup.find('h2').text
dict_name = dict_name.replace('の解説','')
# <h3>項目/h3>を見つけて
item_title_soup = article_soup.find('h3')
item_title = item_title_soup.text.replace('\n','').strip()
# <section class=description>を見つけて
description_soup = article_soup.find('section', class_='description')
description = description_soup.text
description = description.replace('\t','').strip()
description = description.replace('\n','').strip()
description = description.replace(' ','').strip()
result_dict = OrderedDict()
result_dict['item_title'] = item_title,
result_dict['dict_name'] = dict_name,
result_dict['description'] = description,
result_dict['des_size'] = len(description),
result_dict['url'] = url,
newdf = pd.DataFrame(result_dict)
result_df = result_df.append(newdf)
return result_df
else:
return None
def arrange(self, df):
#print df
if not df.empty:
df = df[df.des_size >0] #長さゼロは抜く
df = df.sort_values(by='des_size') #並べ替え
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
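# Usage sketch for the kotobank scraper: the keyword is first re-resolved through the
# Yahoo dictionary search, then each matching kotobank.jp entry page is parsed. Results
# are sorted by description length, so pp(df, 1) shows the shortest entry.
# k = Kotobank_info('確証バイアス')
# kotobank_df = k.collector()
# pp(kotobank_df, 1)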
# ## Weblio
# In[12]:
def weblio(keyword):
#keyword = 'ニューラルネットワーク'
url = 'http://www.weblio.jp/content/'+keyword
r = requests.get( url)
tree = lxml.html.fromstring(r.content)
# links = tree.xpath('//div[@class="pbarTL"]') #辞書名
#links = tree.xpath('//div[@class="NetDicHead"]') #項目名のタグは辞書ごとに違うみたい
# links = tree.xpath('//h2[@class="midashigo"]' )#各辞書の見出し
# links = tree.xpath('//div[@class="sideRWordsL"]/a') #関連語
links = tree.xpath('//*[@id="topicWrp"]/span' ) #全体の見出し
for link in links:
print (link.text_content())
print (link.attrib['href'])
# ## Encyclopedia.com
# In[13]:
def myjoin(list):
result = ''
for i in list:
result += str(i)
return result
class encyclopediacom_info(Searcher):
def make_resource_list(self):
url = 'http://www.encyclopedia.com/searchresults.aspx?q=' + self.keyword
content = requests.get(url, headers=headers).text
soup = BeautifulSoup(content, "lxml")
#<a class="maltlink" href="/topic/encyclopedia.aspx" onclick="OmnitureClick('SearchResults20008.topiclink');">encyclopedia</a>
maltlink_soup = soup.find('a', class_='maltlink')
if maltlink_soup:
url_list = 'http://www.encyclopedia.com' + dict(maltlink_soup.attrs)['href'],
return url_list
else:
return []
def fetch_data(self, url):
'''
        Opens and parses the URL and picks out the entries
'''
# driver = webdriver.PhantomJS()
# driver.get(url)
# r = driver.page_source
r = requests.get(url)
soup = BeautifulSoup(r.text,"lxml")
article_titles = soup.findAll('h2', class_ ='doctitle')
source_titles = soup.findAll('span', class_='pub-name')
be_doc_texts = soup.findAll('div', id='be-doc-text')
APA_citations = soup.findAll('div', id='divAPA')
df = pd.DataFrame()
for article_title, source_title, doc_text, apa in zip(article_titles, source_titles, be_doc_texts, APA_citations):
result_dict = OrderedDict()
result_dict['item_title'] = article_title.text,
result_dict['dict_name'] = source_title.text,
#result_dict['description'] = doc_text.text,
result_dict['description'] = myjoin(doc_text.contents),
#result_dict['citation'] = apa.find('p', class_='cittext').text,
result_dict['citation'] = myjoin(apa.find('p', class_='cittext')),
result_dict['des_size'] = len(doc_text.text),
df = df.append(pd.DataFrame(result_dict))
return df
def arrange(self, df):
if not df.empty:
df = df[df.des_size >0] #長さゼロは抜く
df = df.sort_values(by='des_size') #並べ替え
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
# ## Wikipedias
# In[14]:
class Wikipedia_Info(object):
def __init__(self, keyword, language='en'):
self.keyword = keyword
self.language = language
wikipedia.set_lang(self.language)
def fetch_page(self):
language_dict = wikipedia.languages()
try:
page = wikipedia.WikipediaPage(self.keyword)
except:
print ('Page {} does not match any pages in {} '.format(self.keyword, 'Wikipedia ' + language_dict[self.language]))
else:
result_dict = OrderedDict()
result_dict['item_title'] = page.original_title,
result_dict['dict_name'] = 'Wikipedia ' + language_dict[self.language],
result_dict['description'] = page.content,
result_dict['des_size'] = len(page.content),
result_dict['url'] = page.url,
wiki_df = pd.DataFrame(result_dict)
return wiki_df
# In[15]:
def interlanguage(keyword, language='ja'):
wikipedia.set_lang(language)
try:
page = wikipedia.WikipediaPage(keyword)
except ZeroDivisionError as e:
print (str(type(e)), '\n', str(e))
return
except Exception as e:
print (str(type(e)), '\n', str(e))
language_dict = wikipedia.languages()
print ('Page {} does not match any pages in {} '.format(keyword, 'Wikipedia ' + language_dict[language]))
return
url = page.url
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text,"lxml")
result_dict = OrderedDict()
# <li class="interlanguage-link interwiki-ab">
# <a href="//ab.wikipedia.org/wiki/%D0%90%D0%BB%D0%B0" title="アブハズ語: Ала" lang="ab" hreflang="ab">Аҧсшәа</a>
# </li>
interlanguage_link_soups = soup.findAll("li", class_=re.compile(r'interlanguage.+'))
#print (interlanguage_link_soups)
if interlanguage_link_soups:
result_dict['word'] = [dict(l.a.attrs)['title'].split(': ')[1] for l in interlanguage_link_soups]
result_dict['language'] = [dict(l.a.attrs)['title'].split(': ')[0] for l in interlanguage_link_soups]
result_dict['lang'] = [l.text for l in interlanguage_link_soups]
result_dict['url'] = ['https:' +dict(l.a.attrs)['href'] for l in interlanguage_link_soups]
df = pd.DataFrame(result_dict)
#df = df[df['lang'].str.contains('English|Français|Deutsch|EspañolItaliano|Esperanto|Русский|العربية|Latina|Ελληνικά|中文|فارسی|संस्कृतम्')]
return df
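# Usage sketch: interlanguage() looks a page up in one Wikipedia edition (Japanese by
# default) and returns a DataFrame of the same article's titles and URLs in the other
# language editions, scraped from the interlanguage links.
# lang_df = interlanguage('猫', language='ja')
# lang_df[lang_df['lang'].str.contains('English|Deutsch|Français')]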
# ## Wiktionaries
# In[16]:
def wikitionary(keyword):
url = 'http://en.wiktionary.org/w/api.php?format=xml&action=query&prop=revisions&titles={}&rvprop=content'.format(keyword)
r = requests.get(url)
print (r.text)
# ## Libraries
# In[17]:
class library_info_list(Searcher):
default_max_page = 20
def __init__(self, keyword, max_page = default_max_page):
self.keyword = keyword
self.max_onepage = max_page
def get_hit_num(self, firstpage = 0):
url = self.get_query(firstpage, max_page = 1)
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding), "lxml")
hit_num_result = self.parse_hit_num(soup)
if hit_num_result:
return int(hit_num_result.group(1))
else:
return 0
def make_resource_list(self):
last_page = self.get_hit_num()
return [self.get_query(page, max_page = self.max_onepage) for page in range (0, last_page, self.max_onepage)]
def fetch_data(self, url):
result_df = pd.DataFrame()
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding), "lxml")
for item_soup in self.get_item_soup(soup):
result_dict = self.parse_data(item_soup)
result_df = result_df.append(pd.DataFrame(result_dict))
return result_df
def arrange(self, df):
if not df.empty:
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
# ## NDL Search (国立国会図書館サーチ)
# In[18]:
class ndl_list_info(Searcher):
def get_query(self):
print ('http://iss.ndl.go.jp/api/opensearch?any={}'.format(self.keyword))
return 'http://iss.ndl.go.jp/api/opensearch?any={}&cnt=500'.format(self.keyword)
def fetch_data(self, soup):
#基本項目
basic_tag_list =['category','guid','dc:title','dcndl:titletranscription',
'dc:creator','dcndl:edition','dc:publisher','dcterms:issued','dc:subject']
#print soup
result_dict = OrderedDict()
for tag in basic_tag_list:
tag_soup = soup.find(tag)
if tag_soup:
result_dict[tag] = tag_soup.text.strip(),
else:
result_dict[tag] = '',
#identifier
identifier_type_list = ['dcndl:ISBN', 'dcndl:JPNO']
for tag in identifier_type_list:
tag_content = soup.find('dc:identifier', {'xsi:type' : tag})
if tag_content:
result_dict[tag] = tag_content.text.strip(),
else:
result_dict[tag] = '',
#seeAlso
result_dict['ndldc_url'] = result_dict['ndlopac_url'] = result_dict['jairo_url'] = result_dict['cinii_url'] = '',
for also_soup in soup.findAll('rdfs:seealso'):
resource_url = dict(also_soup.attrs)['rdf:resource']
if r'http://dl.ndl.go.jp' in resource_url:
result_dict['ndldc_url'] = resource_url, #デジタルコレクション
elif r'http://id.ndl.go.jp/bib' in resource_url:
result_dict['ndlopac_url'] = resource_url, #NDL−OPAC詳細、複写依頼も
elif r'http://jairo.nii.ac.jp' in resource_url:
result_dict['jairo_url'] = resource_url, #JAIRO
elif r'http://ci.nii.ac.jp' in resource_url:
result_dict['cinii_url'] = resource_url, #CiNii
return pd.DataFrame(result_dict)
def make_resource_list(self):
url = self.get_query()
r = requests.get(url)
soup = BeautifulSoup(r.content, "lxml")
return soup.findAll("item")
def arrange(self, df):
if not df.empty:
df.columns = ['category','ndl_url','title','titletranscription','creator',
'edition','publisher','issued','subject','isbn','jpno',
'ndldc_url','ndlopac_url','jairo_url','cinii_url']
df = df.sort_values(by='category')
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
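# Usage sketch for the NDL Search OpenSearch wrapper above; a single request returns up
# to 500 records, which arrange() relabels with the column names listed there.
# n = ndl_list_info('文化祭')
# ndl_df = n.collector()
# ndl_df[['title', 'creator', 'publisher', 'issued']].head()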
# ## CiNii Article
# In[19]:
class CiNii_list_info(library_info_list):
default_max_page = 200
#http://ci.nii.ac.jp/search?q=%E6%96%87%E5%8C%96%E7%A5%AD&range=0&sortorder=1&count=200&start=201
def get_query(self, page, max_page):
return 'http://ci.nii.ac.jp/search?' + 'q=' + str(self.keyword) + '&&range=0&sortorder=1&count=' + str(max_page) + '&start=' + str(page)
def make_resource_list(self):
last_page = self.get_hit_num()
return [self.get_query(page, max_page = self.max_onepage) for page in range (1, last_page, self.max_onepage)]
#特異
def parse_hit_num(self, soup):
#<h1 class="heading"?
# <span class="hitNumLabel">検索結果</span>
#
# 407件中&nbsp;1-200&nbsp;を表示
hit_soup = soup.find("h1", class_="heading")
recom = re.compile('([0-9]+)件中')
return recom.search(hit_soup.text)
def get_item_soup(self, soup):
return soup.findAll("dl", class_="paper_class")
def get_detail_title(self, soup):
return soup.find("dt", class_="item_mainTitle item_title").text.strip()
# <p class="item_subData item_authordata">
def get_detail_author(self, soup):
return soup.find("p", class_="item_subData item_authordata").text.replace('\n','').replace('\t','').strip()
def get_detail_journaldata(self, soup):
return soup.find("span", class_="journal_title").text.strip()
def get_detail_url(self, soup):
title_soup = soup.find("dt", class_="item_mainTitle item_title")
return 'http://ci.nii.ac.jp' + dict(title_soup.a.attrs)['href']
# <p class="item_otherdata">
def get_otherdata_url(self, soup):
title_soup = soup.find("p", class_="item_otherdata")
if title_soup.a:
return 'http://ci.nii.ac.jp' + dict(title_soup.a.attrs)['href']
else:
return ''
def parse_data(self, soup):
result_dict = OrderedDict()
result_dict['title'] = self.get_detail_title(soup),
result_dict['author'] = self.get_detail_author(soup),
result_dict['journal'] = self.get_detail_journaldata(soup),
result_dict['url'] = self.get_detail_url(soup),
result_dict['ext_url'] = self.get_otherdata_url(soup),
return result_dict
# ## Webcat Plus Minus
# In[20]:
class webcat_list_info(library_info_list):
default_max_page = 300
def get_query(self, page, max_page):
return 'http://webcatplus.nii.ac.jp/pro/?' + 'q=' + str(self.keyword) + '&n=' + str(max_page) + '&o=yd&lang=ja&s=' + str(page)
def parse_hit_num(self, soup):
# <div id="hit">検索結果 803件中
hit_soup = soup.find("div", id="hit")
recom = re.compile(r'検索結果 ([0-9]+)件中')
return recom.search(hit_soup.string)
def fetch_data(self, url):
result_df = pd.DataFrame()
r = requests.get(url)
tree = lxml.html.fromstring(r.content)
titles = tree.xpath('//*[@id="docs"]/ol/li/div/a')
descris = tree.xpath('//*[@id="docs"]/ol/li/div[2]')
for title,descri in zip(titles, descris):
#print link.text,link.attrib['href'],descri.text
result_dict = OrderedDict()
result_dict['title'] = title.text,
result_dict['description'] = descri.text,
result_dict['url'] = 'http://webcatplus.nii.ac.jp/' + title.attrib['href'],
result_df = result_df.append(pd.DataFrame(result_dict))
return result_df
# In[21]:
class webcat_list_info_old(library_info_list):
default_max_page = 300
#特異
def get_query(self, page, max_page):
return 'http://webcatplus.nii.ac.jp/pro/?' + 'q=' + str(self.keyword) + '&n=' + str(max_page) + '&o=yd&lang=ja&s=' + str(page)
#特異
def parse_hit_num(self, soup):
# <div id="hit">検索結果 803件中
hit_soup = soup.find("div", id="hit")
recom = re.compile(r'検索結果 ([0-9]+)件中')
return recom.search(hit_soup.string.encode('utf-8'))
#<li class="doc">
#<div class="t"><a href="/webcatplus/details/book/30016474.html" target="webcatplus">深層学習</a></div>
#<div class="d">岡谷貴之 著, 講談社, 2015.4, 165p, <span class="st">機械学習プロフェッショナルシリーズ / 杉山将 編</span> </div>
#</li>
#特異
def get_item_soup(self, soup):
return soup.findAll("li", class_="doc")
#特異
def get_detail_title(self, soup):
return soup.find("div", class_="t").text
#特異
def get_detail_biblio(self, soup):
return soup.find("div", class_="d").text
#特異
def get_detail_url(self, soup):
title_soup = soup.find("div", class_="t")
return 'http://webcatplus.nii.ac.jp' + dict(title_soup.a.attrs)['href']
#特異
def parse_data(self, soup):
result_dict = OrderedDict()
result_dict['title'] = self.get_detail_title(soup),
result_dict['description'] = self.get_detail_biblio(soup),
result_dict['url'] = self.get_detail_url(soup),
return result_dict
# In[22]:
class webcat_list_info_title(webcat_list_info):
def get_query(self, page, max_page):
return 'http://webcatplus.nii.ac.jp/pro/?' + 't=' + self.keyword + '&n=' + str(max_page) + '&o=yd&lang=ja&s=' + str(page)
class title2webcat_list(Searcher):
def __init__(self, list):
self.list = list
def make_resource_list(self):
return self.list
def fetch_data(self, title):
w = webcat_list_info_title(title)
return w.collector()
def arrange(self, df):
if not df.empty:
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
# In[23]:
class webcat_info(Searcher):
def __init__(self, keyword):
self.keyword = keyword
def make_resource_list(self):
wl = webcat_list_info(self.keyword)
return list(wl.collector()['url'])
def howmany(self):
return len(self.make_resource_list())
def fetch_data(self, url):
result_df = pd.DataFrame(columns=['isbn','ncid','nbn','title','author','publisher','issued',
'pages','summary','contents'])
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding), "lxml")
# ISBN
isbn_soup = soup.find('th', text="ISBN")
if isbn_soup:
result_df['isbn'] = isbn_soup.next_sibling.next_sibling.text.strip(),
else:
result_df['isbn'] = '',
# NII書誌ID(NCID)
ncid_soup = soup.find('span', class_="ncid")
if ncid_soup:
result_df['ncid'] = ncid_soup.string,
else:
result_df['ncid'] = '',
# 全国書誌番号(JP番号)
nbn_soup = soup.find('span', class_="nbn")
if nbn_soup:
result_df['nbn'] = nbn_soup.string,
else:
result_df['nbn'] = '',
title = ''
title_soup = soup.find("h2", id="jsid-detailTabTitle")
if title_soup:
title = re.sub(r'[\s]+', " ", title_soup.text)
else:
title = ''
seriestitle_soup = soup.find('th', text="シリーズ名")
if seriestitle_soup:
seriestitle = re.sub(r'[\s]+', " ", seriestitle_soup.next_sibling.next_sibling.text)
title += ' ' + seriestitle
result_df['title'] = title,
author_soup = soup.find("p", class_="p-C")
if author_soup:
result_df['author'] = author_soup.string,
else:
result_df['author'] = '',
publisher_soup = soup.find('th', text="出版元")
if publisher_soup:
result_df['publisher'] = publisher_soup.next_sibling.next_element.next_element.strip(),
else:
result_df['publisher'] = '',
year_soup = soup.find('th', text="刊行年月")
if year_soup:
result_df['issued'] = year_soup.next_sibling.next_sibling.string,
else:
result_df['issued'] = '',
page_soup = soup.find('th', text="ページ数")
if page_soup:
result_df['pages'] = page_soup.next_sibling.next_sibling.text.strip(),
else:
result_df['pages'] = '',
#概要
summary_soup = soup.find("div", id="jsid-detailMainText").find("p", class_="p-A")
if summary_soup:
result_df['summary'] = summary_soup.string,
else:
result_df['summary'] = '',
#目次
contents = ''
mokuji_ul = soup.find("ul", class_="ul-A")
if mokuji_ul:
mokuji_list_soup = mokuji_ul.findAll("li")
for mokuji_item_soup in mokuji_list_soup:
contents += mokuji_item_soup.string + '\t'
#掲載作品
mokuji_div = soup.find("div", class_="table-A")
if mokuji_div:
mokuji_list_soup = mokuji_div.findAll("tr")
for mokuji_item_soup in mokuji_list_soup:
title_and_author = mokuji_item_soup.text
title_and_author = re.sub(r'\n[\n\s]+',' - ', title_and_author)
# print title_and_author.encode('utf-8')
title_and_author = title_and_author.replace('\n','')
contents += title_and_author + '\t'
contents = contents.replace('著作名著作者名\t','')
result_df['contents'] = contents,
return result_df
def arrange(self, df):
if not df.empty:
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
# ## WorldCat
# In[24]:
class Worldcat_list_info(library_info_list):
default_max_page = 10
#特異
def get_query(self, page, max_page):
return 'http://webcatplus.nii.ac.jp/pro/?' + 'q=' + str(self.keyword) + '&n=' + str(max_page) + '&o=yd&lang=ja&s=' + str(page)
#特異
def parse_hit_num(self, soup):
# <div id="hit">検索結果 803件中
hit_soup = soup.find("div", id="hit")
recom = re.compile(r'検索結果 ([0-9]+)件中')
return recom.search(hit_soup.string)
def fetch_data(self, url):
result_df = pd.DataFrame()
r = requests.get(url)
tree = lxml.html.fromstring(r.content)
titles = tree.xpath('//*[@id="docs"]/ol/li/div/a')
descris = tree.xpath('//*[@id="docs"]/ol/li/div[2]')
for title,descri in zip(titles, descris):
#print link.text,link.attrib['href'],descri.text
result_dict = OrderedDict()
result_dict['title'] = title.text,
result_dict['description'] = descri.text,
result_dict['url'] = 'http://webcatplus.nii.ac.jp/' + title.attrib['href'],
result_df = result_df.append(pd.DataFrame(result_dict))
return result_df
# ## Collaborative Reference Database (レファレンス協同データベース)
# In[25]:
class refcases_list_info(library_info_list):
default_max_page = 200
def make_resource_list(self):
last_page = self.get_hit_num() // self.max_onepage + 1
return [self.get_query(page, max_page = self.max_onepage) for page in range (1, last_page + 1)]
def get_query(self, page, max_page):
query = 'http://crd.ndl.go.jp/reference/modules/d3ndlcrdsearch/index.php?page=detail_list&type=reference'
query += '&mcmd=' + str(max_page) + '&st=score&asc=desc&kg1=99'
query += '&kw1=' + urllib.parse.quote(self.keyword) + '&kw_lk1=1&kg2=2&kw2=&kw_lk2=1&kg3=6&pg=' + str(page)
return query
def parse_hit_num(self, soup):
hit_soup = soup.find("div", id="TabbedPanels1")
recom = re.compile(r'検索結果 ([0-9]+)件中')
return recom.search(soup.text)
def get_hit_num(self):
url = self.get_query(page = 1, max_page = 10)
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding), "lxml")
hit_num_result = self.parse_hit_num(soup)
if hit_num_result:
return int(hit_num_result.group(1))
else:
return 0
def get_item_soup(self, soup):
table_soup = soup.find('table', class_='slTable')
return table_soup.findAll('a', {'href':re.compile(r'http://crd.ndl.go.jp/reference/modules/d3ndlcrdentry/index.php?.+')})
def get_detail_title(self, soup):
return soup.text
def get_detail_url(self, soup):
return dict(soup.attrs)['href']
def parse_data(self, soup):
result_dict = OrderedDict()
result_dict['title'] = self.get_detail_title(soup),
result_dict['detail_url'] = self.get_detail_url(soup),
return result_dict
# In[26]:
class Refcases_info(Searcher):
def __init__(self, keyword, search_num= 50):
self.keyword = keyword
self.search_num = search_num
def make_resource_list(self):
rl = refcases_list_info(self.keyword)
return list(rl.collector()['detail_url'])[:self.search_num]
def howmany(self):
return len(self.make_resource_list())
def fetch_data(self, url):
result_dict = OrderedDict()
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding), "lxml")
#<div class="refCaseBox">
refcases_soup = soup.find('table', class_="refCaseTable")
th_soups = refcases_soup.findAll('th')
td_soups = refcases_soup.findAll('td')
for th_soup, td_soup in zip(th_soups, td_soups):
result_dict[th_soup.text] = th_soup.next_sibling.text,
result_dict['url'] = url
return pd.DataFrame(result_dict)
def arrange(self, df):
if not df.empty:
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
# ## Merging multiple book lists and fetching details
# In[27]:
class list2webcat_info(webcat_info):
def __init__(self, url_list):
self.url_list = url_list
def make_resource_list(self):
return self.url_list
def nbn2webcaturl(nbn):
return 'http://webcatplus.nii.ac.jp/webcatplus/details/book/nbn/{}.html'.format(nbn)
def ncid2webcaturl(ncid):
return 'http://webcatplus.nii.ac.jp/webcatplus/details/book/ncid/{}.html'.format(ncid)
def isbn2webcaturl(isbn):
url = 'http://webcatplus.nii.ac.jp/pro/?i=' + isbn
content = requests.get(url).text
soup = BeautifulSoup(content, "lxml")
title_soup = soup.find('li', class_='doc')
if title_soup:
return 'http://webcatplus.nii.ac.jp' + dict(title_soup.find('div', class_='t').a.attrs)['href']
else:
return None
def title2webcaturl(title):
url = 'http://webcatplus.nii.ac.jp/pro/?t=' + title
content = requests.get(url).text
soup = BeautifulSoup(content, "lxml")
title_soup = soup.find('li', class_='doc')
if title_soup:
return 'http://webcatplus.nii.ac.jp' + dict(title_soup.find('div', class_='t').a.attrs)['href']
else:
return None
def asin2webcaturl(asin):
if asin[0].isdigit():
#そのままISBNとしてurlをつくる
return isbn2webcaturl(asin)
else:
#Amazonでタイトル取得
items = item_search(asin)
title = items[0].find('title').text
return title2webcaturl (title)
def df2webcaturl(df):
if 'url' in df.columns and 'http://webcatplus.nii.ac.jp' in df['url'].iat[0]:
return [url for url in df['url']]
elif 'isbn' in df.columns:
return [isbn2webcaturl(isbn) for isbn in df['isbn']]
elif 'asin' in df.columns:
return [asin2webcaturl(asin) for asin in df['asin']]
elif 'title' in df.columns:
return [title2webcaturl(title) for title in df['title']]
def df_list2webcaturl(df_list):
'''
    Takes several book-list DataFrames, removes duplicates, and returns a list of Webcat detail-page URLs
'''
urls =[]
for df in df_list:
#print type(df)
for url in df2webcaturl(df):
if url and not url in urls:
urls.append(url)
return urls
def make_biblio(df_list):
urls = df_list2webcaturl(df_list)
#print urls
w = list2webcat_info(urls)
return w.collector()
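# Usage sketch: make_biblio() accepts any number of book-list DataFrames (anything with a
# Webcat url, isbn, asin or title column), deduplicates them via their Webcat Plus Pro
# detail-page URLs, and fetches a full bibliographic record for each. The input names
# below are hypothetical.
# bib_df = make_biblio([ndl_df, webcat_df])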
# In[28]:
def keyword2webcaturl(keyword):
url = 'http://webcatplus.nii.ac.jp/pro/?m=b&q=' + keyword
content = requests.get(url).text
soup = BeautifulSoup(content, "lxml")
title_soup = soup.find('li', class_='doc')
if title_soup:
#タイトルとurlのリストを返す
return [title_soup.find('div', class_='t').text, 'http://webcatplus.nii.ac.jp' + dict(title_soup.find('div', class_='t').a.attrs)['href'] ]
else:
return None
def textlist2webcaturl(text):
'''
    Helper that does the actual work of the text2bib magic
'''
df = pd.DataFrame(columns=['description','url'])
for line in text.split('\n'):
#print 'this book is being searched ' + line
line = re.sub('[『』、,]', ' ', line)
splited_line = line.split()
#見つかるまで少しずつキーワードを縮めていく(最低2個)
for i in range(len(splited_line),1,-1):
keyword = '+'.join(splited_line[0:i])
#print keyword
now_result = keyword2webcaturl(keyword)
if now_result:
#print 'hit ' +now_result[0] + now_result[1]
df = df.append(pd.DataFrame({'description':[line],'url':[now_result[1]]}))
break
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
@register_cell_magic
def text2bib(line, cell):
'''
    Takes bibliographic information in plain text, looks up the Webcat Plus Pro detail-page URL for each line,
    and returns the result in a form that can be merged into a bibliography DataFrame.
    Return value: DataFrame
    Format:
    %%text2bib
    (bibliographic entry 1: title, author, etc., separated by spaces; one entry per line)
    (bibliographic entry 2: title, author, etc., separated by spaces; one entry per line)
      ……
    Example:
%%text2bib
フランクリン R.バーリンゲーム 中村保男訳. 時事通信社, 1956.
フランクリン研究 その経済思想を中心として 久保芳和 関書院, 1957.
フランクリンとアメリカ文学 渡辺利雄 研究社出版, 1980.4. 研究社選書
進歩がまだ希望であった頃 フランクリンと福沢諭吉 平川祐弘 新潮社, 1984.9. のち講談社学術文庫
フランクリン 板倉聖宣 仮説社, 1996.8. やまねこ文庫
ベンジャミン・フランクリン、アメリカ人になる ゴードン・S・ウッド 池田年穂,金井光太朗,肥後本芳男訳. 慶應義塾大学出版会
'''
sio = io.StringIO(cell)
return textlist2webcaturl(sio.getvalue())
# ## Amazon Review
# In[29]:
def isbn13to10(isbn):
#978-4-532-35628-6
isbn = isbn.replace('-','')
a = re.search(r'978([0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9])[0-9]', str(isbn))
if a:
isbn9 = a.group(1).replace('-','')
#print isbn9
list_ismb9 = list(str(isbn9))
#print list_ismb9
list_ismb9.reverse()
sum = 0
w = 2
for i in list_ismb9:
sum += int(i) * w
w += 1
d = 11 - (sum % 11)
if d == 11:
d = '0'
elif d == 10:
d = 'x'
else:
d = str(d)
return isbn9 + d
else:
return isbn
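# The conversion above keeps the nine digits that follow the 978 prefix and recomputes
# the ISBN-10 check digit: the digits are weighted 2,3,...,10 from the right (equivalent
# to 10,9,...,2 from the left), and the check digit is 11 - (weighted sum mod 11), with
# 11 written as '0' and 10 as 'x'. For example, with the ISBN from the comment above:
# isbn13to10('978-4-532-35628-6')   # -> the ten-digit form used in Amazon URLs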
def get_review_url(ASIN):
return 'http://www.amazon.co.jp/product-reviews/' + str(ASIN) + '/ref=cm_cr_dp_see_all_summary?ie=UTF8&showViewpoints=1&sortBy=byRankDescending'
# def get_review_url2(ASIN):
# return 'http://www.amazon.co.jp/product-reviews/' + \
# str(ASIN) + '/ref=cm_cr_dp_see_all_summary?ie=UTF8&pageNumber=2&sortBy=byRankDescending'
# <span class="a-size-medium a-text-beside-button totalReviewCount">18</span>
def fetch_review(url):
r = requests.get(url)
if r.status_code == 200:
# soup = BeautifulSoup(r.text.encode(r.encoding), "html.parser")
soup = BeautifulSoup(r.text, "lxml")
#<span class="a-size-base review-text">レビュー</span>
review_title_list = soup.findAll("a", class_="a-size-base a-link-normal review-title a-color-base a-text-bold")
review_list = soup.findAll("span", class_="a-size-base review-text")
review_buffer =''
for review_title, review_item in zip(review_title_list, review_list):
review_buffer += '◯' +review_title.text +':'+ review_item.text+'\t'
return review_buffer
def amazon_review(ISBN):
return fetch_review(get_review_url(isbn13to10(ISBN)))
class amazon_reviews(Searcher):
def __init__(self, df):
self.biblio_df = df
def make_resource_list(self):
if 'isbn' in self.biblio_df.columns:
return list(self.biblio_df['isbn'])
elif 'asin' in self.biblio_df.columns:
return list(self.biblio_df['asin'])
else:
return None
def fetch_data(self, isbn):
result_df = pd.DataFrame()
if isbn:
url = get_review_url(isbn13to10(isbn))
r = requests.get(url)
if r.status_code == 200:
soup = BeautifulSoup(r.text, "lxml")
review_book = soup.find('a',class_="a-size-large a-link-normal")
review_title_list = soup.findAll("a", class_="a-size-base a-link-normal review-title a-color-base a-text-bold")
review_list = soup.findAll("span", class_="a-size-base review-text")
for review_title, review_item in zip(review_title_list, review_list):
result_dict = OrderedDict()
result_dict['isbn'] = isbn,
result_dict['book_title'] = review_book.text,
result_dict['review_title'] = review_title.text,
result_dict['description'] = review_item.text,
result_dict['des_size'] = len(review_item.text),
result_df = result_df.append(pd.DataFrame(result_dict))
return result_df
def arrange(self, df):
if not df.empty:
            df = df.sort_values(by='book_title')
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
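# Usage sketch: amazon_reviews walks the isbn (or asin) column of a bibliography
# DataFrame and scrapes the Amazon review pages, returning one row per review.
# The input name is hypothetical.
# ar = amazon_reviews(bib_df)
# review_df = ar.collector()
# pp(review_df, 1)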
# ## Availability information
# ## Calil
# In[30]:
liblary_columns=['systemid','systemname','formal','address','tel','geocode','category','url_pc']
def near_libraries_df(locate):
'''
    Takes a place name and returns a DataFrame of nearby libraries (name, address, phone, geocode, URL, etc.)
'''
df = pd.DataFrame(columns=liblary_columns)
#print locate
geocoding_url = 'http://www.geocoding.jp/api/?v=1.1&q={}'.format(locate)
r = requests.get(geocoding_url)
soup = BeautifulSoup(r.text.encode(r.encoding),"lxml")
#print soup
lng = soup.find('lng').text
lat = soup.find('lat').text
url = 'http://api.calil.jp/library?appkey={}&geocode={},{}'.format(calil_app_key, lng, lat)
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding),"lxml")
libraries = soup.findAll('library')
for library_soup in libraries:
now_dict = OrderedDict()
for now_column in liblary_columns:
now_dict[now_column] = [library_soup.find(now_column).text]
df = df.append(pd.DataFrame(now_dict))
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
def prefecture_libraries_df(locate):
df = pd.DataFrame(columns=liblary_columns)
url = 'http://api.calil.jp/library?appkey={}&pref={}'.format(calil_app_key, locate)
r = requests.get(url)
soup = BeautifulSoup(r.content,"lxml")
libraries = soup.findAll('library')
for library_soup in libraries:
now_dict = OrderedDict()
for now_column in liblary_columns:
now_dict[now_column] = [library_soup.find(now_column).text]
df = df.append(pd.DataFrame(now_dict))
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
def one_city_libraries_df(pref, city):
df = pd.DataFrame(columns=liblary_columns)
url = 'http://api.calil.jp/library?appkey={}&pref={}&city={}'.format(calil_app_key, pref, city)
#print url
r = requests.get(url)
soup = BeautifulSoup(r.content,"lxml")
#print soup
libraries = soup.findAll('library')
for library_soup in libraries:
now_dict = OrderedDict()
for now_column in liblary_columns:
now_dict[now_column] = [library_soup.find(now_column).text]
df = df.append(pd.DataFrame(now_dict))
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
def city_libraries_df(locate):
url = 'http://geoapi.heartrails.com/api/xml?method=suggest&matching=like&keyword={}'.format(locate)
r = requests.get(url)
soup = BeautifulSoup(r.content,"lxml")
#print soup, soup.find('error')
if soup.find('error'):
#市町村名じゃなかったら、近隣を探す
print (locate,'付近の図書館を検索')
return near_libraries_df(locate)
else:
#市町村名なら県名と市町村名を得て、それで探す
pref = soup.find('prefecture').text
city = soup.find('city').text
#print pref, city
if '区' in city:
#政令指定都市は区が入ってるので取り除く
city = re.sub('[^市]+区', '', city)
#print pref, city
print (pref, city, '内の図書館を検索')
return one_city_libraries_df(pref, city)
prefectures = ['北海道','青森県','岩手県','宮城県','秋田県','山形県','福島県','茨城県','栃木県','群馬県',
'埼玉県','千葉県','東京都','神奈川県','新潟県','富山県','石川県','福井県','山梨県','長野県',
'岐阜県','静岡県','愛知県','三重県','滋賀県','京都府','大阪府','兵庫県','奈良県','和歌山県',
'鳥取県','島根県','岡山県','広島県','山口県','徳島県','香川県','愛媛県','高知県','福岡県',
'佐賀県','長崎県','熊本県','大分県','宮崎県','鹿児島県','沖縄県']
def library_list(locate_list):
'''
    Takes a list of place names and returns a DataFrame of nearby libraries (name, address, phone, geocode, URL, etc.)
'''
df = pd.DataFrame()
for locate in locate_list:
if locate in prefectures:
#県名のみなら県内の一覧を返す
df = df.append(prefecture_libraries_df(locate))
else:
df = df.append(city_libraries_df(locate))
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
# In[31]:
def get_library_data(df, now_systemid, request_column):
return df[df['systemid']==now_systemid][request_column].values[0]
def get_biblio_data(df, now_isbn, request_column):
return df[df['isbn']==now_isbn][request_column].values[0]
def library_status(biblio_df, library_df = MYLIBRARY):
'''
    Takes a bibliography DataFrame containing ISBNs and a library-list DataFrame, and returns a DataFrame
    of ISBN, title, holdings / lending status, and reservation URL for each book at each library
'''
isbn_list = [isbn for isbn in biblio_df['isbn']]
library_list = [library for library in library_df['systemid']]
df = pd.DataFrame(columns=['isbn','title','library','status','url'])
#一回目のリクエスト
api = 'http://api.calil.jp/check?appkey={}&isbn={}&systemid={}&format=xml'.format(calil_app_key, ','.join(isbn_list), ','.join(library_list))
r = requests.get(api)
soup = BeautifulSoup(r.text,"lxml")
#終了かどうか状況取得 0(偽)または1(真)。1の場合は、まだすべての取得が完了していない.
now_continue = soup.find('continue').text
while '0' not in now_continue:
#セッション情報取得
now_session = soup.find('session').text
#5秒待つ
time.sleep(5)
#再度のリクエスト
#print now_continue, "I'm waiting..."
api = 'http://api.calil.jp/check?appkey={}&session={}&format=xml'.format(calil_app_key, now_session)
r = requests.get(api)
soup = BeautifulSoup(r.text,"lxml")
now_continue = soup.find('continue').text
#ループを抜けたらパース処理
books = soup.findAll('book')
for booksoup in books:
now_isbn = dict(booksoup.attrs)['isbn']
#本ごとの下に各館の状況
librarysoups = booksoup.findAll('system')
for librarysoup in librarysoups:
#各館の下に貸出状況
now_systemid = dict(librarysoup.attrs)['systemid']
now_libkeys_soup = librarysoup.find('libkeys')
if now_libkeys_soup and now_libkeys_soup.text:
#libkeysがあれば本がある
now_libkeys = now_libkeys_soup.text
now_reserveurl = librarysoup.find('reserveurl').text
else:
now_libkeys = 'None'
now_reserveurl = 'None'
now_dict = OrderedDict()
now_dict['isbn'] = now_isbn,
now_dict['title'] = get_biblio_data(biblio_df, now_isbn, 'title'),
now_dict['library'] = get_library_data(library_df, now_systemid, 'formal'),
now_dict['status'] =now_libkeys,
now_dict['url'] =now_reserveurl,
df = df.append(pd.DataFrame(now_dict))
df = df[df['status']!='None']
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
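# Usage sketch for the Calil availability check: build a library list first, then pass it
# together with a bibliography DataFrame that has isbn and title columns. The input names
# are hypothetical; a valid calil_app_key is required.
# lib_df = library_list(['大阪市', '京都市'])
# status_df = library_status(bib_df, lib_df)
# embed_df(status_df)   # reservation URLs are rendered as LINKs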
# ## Amazon stock / lowest-price check
# In[32]:
def amazon_stock_thisbook(keyword):
result_df = pd.DataFrame()
book_soups = item_search(keyword)
if book_soups:
for book_soup in book_soups:
result_dict = OrderedDict()
result_dict['asin'] = book_soup.find('asin').text,
result_dict['title'] = book_soup.find('title').text,
if book_soup.find('formattedprice'):
result_dict['listprice'] = book_soup.find('formattedprice').text,
offersummary_soup = book_soup.find('offersummary')
if offersummary_soup:
#print keyword, offersummary_soup
result_dict['totalnew'] = offersummary_soup.find('totalnew').text,
if offersummary_soup.find('lowestnewprice'):
result_dict['lowestnewprice'] = offersummary_soup.find('lowestnewprice').find('formattedprice').text,
else:
result_dict['lowestnewprice'] = 'None',
result_dict['totalused'] = offersummary_soup.find('totalused').text,
if offersummary_soup.find('lowestusedprice'):
result_dict['lowestusedprice'] = offersummary_soup.find('lowestusedprice').find('formattedprice').text,
else:
result_dict['lowestusedprice'] = 'None',
result_dict['url'] = 'http://www.amazon.co.jp/dp/{}/'.format(book_soup.find('asin').text)
result_df = result_df.append(pd.DataFrame(result_dict))
return result_df
def ciniibook_title(ncid):
url = 'http://ci.nii.ac.jp/ncid/{}'.format(ncid)
r = requests.get(url)
soup = BeautifulSoup(r.text.encode('utf-8'),"lxml")
book_title_soup = soup.find('meta', attrs={'name':'dc.title'})
if book_title_soup:
return dict(book_title_soup.attrs)['content']
def amazon_stocks(biblio_df):
result_df = pd.DataFrame(columns=['asin','title','listprice','totalnew','lowestnewprice','totalused','lowestusedprice','url'])
for i in range(0, len(biblio_df)):
print ('.', end ='')
if 'isbn' in biblio_df.columns:
isbn = isbn13to10(biblio_df['isbn'].iat[i])
elif 'asin' in biblio_df.columns:
isbn = biblio_df['asin'].iat[i]
else:
isbn = None
if 'ncid' in biblio_df.columns:
ncid = biblio_df['ncid'].iat[i]
else:
ncid = None
if 'title' in biblio_df.columns:
title = biblio_df['title'].iat[i]
else:
title = None
if isbn and len(amazon_stock_thisbook(isbn)):
result_df = result_df.append(amazon_stock_thisbook(isbn))
elif ncid and ciniibook_title(ncid):
result_df = result_df.append(amazon_stock_thisbook(ciniibook_title(ncid)))
elif len(amazon_stock_thisbook(str(title))):
result_df = result_df.append(amazon_stock_thisbook(str(title)))
else:
result_dict = OrderedDict()
result_dict['asin'] = isbn,
result_dict['title'] = title,
result_dict['listprice'] = 'None',
result_dict['totalnew'] = 'None',
result_dict['lowestnewprice'] = 'None',
result_dict['totalused'] ='None',
result_dict['lowestusedprice'] = 'None',
result_dict['url'] = 'None',
result_df = result_df.append(pd.DataFrame(result_dict))
result_df = result_df.sort_values(by='title')
result_df.index = range(1, len(result_df)+1) #最後にインデックスつけ
return result_df
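# Usage sketch: amazon_stocks() looks each book up on Amazon by ISBN (falling back to the
# CiNii NCID or the title) and reports stock counts and the lowest new / used prices.
# The input name is hypothetical, and valid AMAZON_* keys are required.
# stock_df = amazon_stocks(bib_df)
# embed_df(stock_df)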
# ## 「日本の古本屋」 (Japanese secondhand bookshops)
# In[33]:
class Kosho_info(library_info_list):
default_max_page = 100
#https://www.kosho.or.jp/products/list.php?transactionid=f933edac26fe92ced0cdfbbd118b58d90e5e59f9&mode=search_retry&pageno=2&search_pageno=1&product_id=&reset_baseinfo_id=&baseinfo_id=&product_class_id=&quantity=1&from_mode=&search_facet_publisher=&search_word=%E6%A0%AA%E5%BC%8F%E6%8A%95%E8%B3%87&search_name=&search_name_matchtype=like&search_author=&search_author_matchtype=like&search_publisher=&search_publisher_matchtype=like&search_isbn=&search_published_year_min=&search_published_year_max=&search_comment4=&search_comment4_matchtype=like&search_book_flg=&search_price_min=&search_price_max=&search_only_has_stock=1&search_orderby=score&search_sorttype=asc&search_page_max=20&search_image_disp=&search_orderby_navi=score&search_sorttype_navi=asc&search_page_max_navi=20&search_image_disp_navi=&transactionid=f933edac26fe92ced0cdfbbd118b58d90e5e59f9
def get_query(self, page, max_page):
return 'https://www.kosho.or.jp/products/list.php?mode=search_retry&pageno=' + str(page) + '&search_pageno=1&quantity=1&search_word=' + self.keyword + '&search_only_has_stock=1&search_orderby=score&search_sorttype=asc&search_page_max=' + str(max_page)
def make_resource_list(self):
last_page = self.get_hit_num() // self.max_onepage + 1
return [self.get_query(page, max_page = self.max_onepage) for page in range (1, last_page + 1)]
#<form name="form1"
# <!--★件数-->
#<div><span class="attention">134件</span>が見つかりました。</div>
def parse_hit_num(self, soup):
hit_soup = soup.find("form", id="form1")
recom = re.compile('([0-9]+)件が見つかりました。')
return recom.search(hit_soup.text)
#<div class="search_result_list product_list">
def get_item_soup(self, soup):
return soup.findAll("div", class_="search_result_list product_list")
#<a href="javascript:goDetail(5185060);">株式投資=はじめるまえの50の常識</a>
def get_detail_title(self, soup):
return soup.find('a', {'href':re.compile(r'javascript:goDetail.+')}).text.strip()
#https://www.kosho.or.jp/products/detail.php?product_id=5185060
def get_detail_url(self, soup):
title_soup = soup.find('a', {'href':re.compile(r'javascript:goDetail.+')})
recom = re.compile(r'javascript:goDetail\(([0-9]+)\);')
m = recom.search(dict(title_soup.attrs)['href'])
return 'https://www.kosho.or.jp/products/detail.php?product_id=' + m.group(1)
#<div class="product_info wide">または<div class="product_info">
def get_detail_productinfo(self, soup):
info_soup = soup.find("div", class_=re.compile(r'product_info.*'))
if info_soup:
return info_soup.text.replace('\n','').replace('\t','').strip()
else:
return ''
#<span class="price">3,500</span>
def get_detail_price(self, soup):
return soup.find('span', class_='price').text.strip()
#特異
def parse_data(self, soup):
result_dict = OrderedDict()
result_dict['title'] = self.get_detail_title(soup),
result_dict['info'] = self.get_detail_productinfo(soup),
result_dict['price'] = self.get_detail_price(soup),
result_dict['url'] = self.get_detail_url(soup),
return result_dict
class Kosho_info_title(Kosho_info):
#https://www.kosho.or.jp/products/list.php?
#mode=search_retry&pageno=&search_pageno=1&product_id=&reset_baseinfo_id=&
#baseinfo_id=&product_class_id=&quantity=1&from_mode=&search_facet_publisher=
#&search_word=&search_name=%E7%A9%BA%E3%81%AE%E8%89%B2%E3%81%AB%E3%81%AB%E3%81%A6%E3%81%84%E3%82%8B
def get_query(self, page, max_page):
return 'https://www.kosho.or.jp/products/list.php?' + 'mode=search_retry&pageno=&search_pageno=1&product_id=&reset_baseinfo_id=&' + 'baseinfo_id=&product_class_id=&quantity=1&from_mode=&search_facet_publisher=' + '&search_only_has_stock=1&search_word=&search_name=' +self.keyword
def kosho_stocks(biblio_df):
result_df = pd.DataFrame()
for i in range(0, len(biblio_df)):
title = biblio_df['title'].iat[i]
k = Kosho_info_title(title)
df = k.collector()
result_df = result_df.append(df)
result_df = result_df.sort_values(by='title')
result_df.index = range(1, len(result_df)+1) #最後にインデックスつけ
return result_df
# ## National Diet Library and prefectural libraries
# In[34]:
def house_books(soup):
#所蔵館と貸し出し状況
items = soup.findAll('dcndl:item')
#result = '所蔵館数:' + str( len(items)) + '\n'
result_df = pd.DataFrame(columns=['title','library_name','availability','description','url'])
for item in items:
print ('.', end ='')
result_dict = OrderedDict()
result_dict['title'] = soup.find('dcterms:title').text,
result_dict['library_name'] = item.find('foaf:name').text,
result_dict['availability'] = result_dict['description'] = result_dict['url'] ='',
if item.find('dcndl:availability'):
result_dict['availability'] = item.find('dcndl:availability').text,
if item.find('dcterms:description'):
result_dict['description'] = item.find('dcterms:description').text,
if item.find('rdfs:seealso'):
result_dict['url'] = dict(item.find('rdfs:seealso').attrs)['rdf:resource'] #各図書館の詳細ページ
result_df = result_df.append(pd.DataFrame(result_dict))
return result_df
def nbn2ndlurl(nbn):
url ='http://iss.ndl.go.jp/api/opensearch?jpno={}'.format(nbn)
#print (url)
try:
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.content, "lxml")
guid = soup.find('guid', ispermalink='true')
return guid.text
except:
return None
def ndl_stocks(df):
result_df = pd.DataFrame(columns=['title','library_name','availability','description','url'])
for nbn in df['nbn']:
url = nbn2ndlurl(nbn)
if url:
url_rdf = url + '.rdf'
r = requests.get(url_rdf)
soup = BeautifulSoup(r.content, "lxml")
result_df = result_df.append(house_books(soup))
result_df.index = range(1, len(result_df)+1) #最後にインデックスつけ
return result_df
# ## NDL Digital Collections, copy service, and external repositories
# In[35]:
def ndl_collection(df):
result_df = pd.DataFrame()
basic_term_list = ['dcterms:title','dc:creator','dcterms:issued']
resource_list=[r'id.ndl.go.jp',r'dl.ndl.go.jp',r'ci.nii.ac.jp',r'jairo.nii.ac.jp']
for url in df['ndl_url']:
url_rdf = url + '.rdf'
r = requests.get(url_rdf)
soup = BeautifulSoup(r.content, "lxml")
#print soup
result_dict = OrderedDict()
for basic_term in basic_term_list:
if soup.find(basic_term):
result_dict[basic_term] = soup.find(basic_term).text,
else:
result_dict[basic_term] = '',
for resource in resource_list:
result_dict[resource] = ''
seealso_soups = soup.findAll('rdfs:seealso')
for seealso_soup in seealso_soups:
#print seealso_soup
seealso_url = dict(seealso_soup.attrs)['rdf:resource']
#print seealso_url
for resource in resource_list:
if resource in seealso_url:
#print resource
result_dict[resource] = seealso_url,
result_df = result_df.append(pd.DataFrame(result_dict))
result_df.index = range(1, len(result_df)+1) #最後にインデックスつけ
return result_df
# ## Google search
# In[36]:
class Google_info(Searcher):
default_max_page = 50
# https://www.google.co.jp/search?q=[keyword]&num=10&start=10
def get_query(self, page, max_page):
return 'https://www.google.co.jp/search?q={}&num={}&start={}'.format(self.keyword,max_page,page)
def make_resource_list(self):
last_page = 50
return [self.get_query(page, self.default_max_page) for page in range (0, last_page, self.default_max_page)]
def fetch_data(self, url):
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
r = requests.get(url, headers=headers)
tree = lxml.html.fromstring(r.content.decode('utf-8'))
h3_soups = tree.xpath('//h3[@class="r"]/a')
st_soups = tree.xpath('//span[@class="st"]')
df = pd.DataFrame()
for i_h3,i_st in zip(h3_soups, st_soups):
result_dict = OrderedDict()
result_dict['title'] = i_h3.text,
result_dict['summary'] = i_st.text_content(),
result_dict['url'] = i_h3.attrib['href'],
df = df.append(pd.DataFrame(result_dict))
return df
def arrange(self, df):
if not df.empty:
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
# In[37]:
class Google_book_info(Google_info):
def get_query(self, page, max_page):
return 'https://www.google.co.jp/search?tbm=bks&q={}&num={}&start={}' .format(self.keyword,max_page,page)
# In[38]:
class Google_scholar_info(Google_info):
default_max_page = 10
def get_query(self, page, max_page):
return 'https://scholar.google.co.jp/scholar?q={}&start={}' .format(self.keyword, page)
def make_resource_list(self):
last_page = 100
return [self.get_query(page, self.default_max_page) for page in range (0, last_page, self.default_max_page)]
def fetch_data(self, url):
#print url
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text.encode('utf-8'),"lxml")
df = pd.DataFrame()
items_soup = soup.findAll("div", class_="gs_r")
for item_soup in items_soup:
result_dict = OrderedDict()
h3_soup = item_soup.find("h3", class_="gs_rt")
if h3_soup:
result_dict['title'] = h3_soup.text,
if h3_soup.a:
result_dict['url'] = dict(h3_soup.a.attrs)['href'],
else:
result_dict['url'] = ''
resource_soup = item_soup.find("div", class_="gs_a")
if resource_soup:
result_dict['resource'] = resource_soup.text,
else:
result_dict['resource'] = ''
summary_soup = item_soup.find("div", class_="gs_rs")
if summary_soup:
result_dict['summary'] = summary_soup.text,
else:
result_dict['summary'] = ''
cites_soup = item_soup.find("div", class_="gs_fl")
if cites_soup and cites_soup.a:
result_dict['cited'] = dict(cites_soup.a.attrs)['href'],
else:
result_dict['cited'] = ''
download_soup = item_soup.find("div", class_="gs_md_wp gs_ttss")
if download_soup and download_soup.a:
result_dict['download_url'] = dict(download_soup.a.attrs)['href'],
else:
result_dict['download_url'] = ''
df = df.append(pd.DataFrame(result_dict))
return df
# ## Library Genesis: Scientific Articles
# In[39]:
class LibraryGenesisArticles(Google_info):
default_max_page = 25
def get_query(self, page):
return 'http://gen.lib.rus.ec/scimag/index.php?s={}&page={}' .format(self.keyword, page)
def parse_hit_num(self, soup):
recom = re.compile('Found ([0-9]+) results')
return recom.search(soup.text)
def get_hit_num(self):
url = self.get_query(page = 1)
r = requests.get(url)
soup = BeautifulSoup(r.text.encode(r.encoding), "lxml")
hit_num_result = self.parse_hit_num(soup)
if hit_num_result:
return int(hit_num_result.group(1))
else:
return 0
def make_resource_list(self):
last_page = self.get_hit_num ()
return [self.get_query(page//self.default_max_page + 1) for page in range (0, last_page, self.default_max_page)]
def fetch_data(self, url):
tree = lxml.html.parse(url)
root="/html/body/table[2]/tr"
item_dict = {"doi" : "td[1]/table/tr[1]/td[2]",
"author" : "td[2]",
"title" : "td[3]",
"doiowner" : "td[4]",
"journal" : "td[5]/a",
"issue" : "td[6]",
"issn" : "td[7]",
"sizekb" : "td[8]",
"others" : "td[9]"}
link_dict = {
"Libgen_url" : "td[1]/table/tr[2]/td[2]/a"}
df = pd.DataFrame()
for i in range(1,26):
result_dict = OrderedDict()
for item_name in item_dict.keys():
now_xpath_format = root + '[{}]/' + item_dict[item_name]
now_xpath = now_xpath_format.format(i)
#print now_xpath
now_tree_xpath = tree.xpath(now_xpath)
if now_tree_xpath:
result_dict[item_name] = now_tree_xpath[0].text_content(),
else:
result_dict[item_name] = '',
for item_name in link_dict.keys():
now_xpath_format = root + '[{}]/' + link_dict[item_name]
now_xpath = now_xpath_format.format(i)
#print now_xpath
now_tree_xpath = tree.xpath(now_xpath)
if now_tree_xpath and ('a' in link_dict[item_name].split('/')[-1]):
#if the xpath ends in /a, grab the link's href attribute
result_dict[item_name] = now_tree_xpath[0].attrib['href'],
df = df.append(pd.DataFrame(result_dict))
return df
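# A usage sketch for the Library Genesis article search above (wrapped by the %LibraryGenesisArticles
# magic further down). It scrapes the scimag index page, so the hard-coded XPath table layout in
# fetch_data is an assumption about the site's current HTML and may need adjusting.
# In[ ]:
#lg = LibraryGenesisArticles('deep learning')
#lg.collector()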
# ## Graphviz helpers
# In[53]:
@register_cell_magic
def csv2dot(line, cell):
'''
Build a network diagram from comma-separated text.
Example:
%%csv2dot
角倉了以,日本大百科全書,角倉了以とその子
角倉了以,朝日日本歴史人物事典,「角倉文書」(『大日本史料』12編14所収)
角倉了以,参考調査便覧,宮本又次他『日本貿易人の系譜』有斐閣1980
'''
G =Digraph()
G.graph_attr.update(layout = 'dot', rankdir='LR', concentrate = 'true')
for line in cell.split('\n'):
items = line.split(',')
for i in range(len(items))[:-1]:
print (items[i],'->', items[i+1])
G.edge(items[i], items[i+1])
return G
# In[57]:
@register_cell_magic
def indent2dot(line, cell):
'''
Build a network diagram from indented text (one node per line; depth is given by leading spaces).
Example:
%%indent2dot
角倉了以
日本大百科全書
角倉了以とその子
朝日日本歴史人物事典
「角倉文書」(『大日本史料』12編14所収)
角倉了以とその子
参考調査便覧
宮本又次他『日本貿易人の系譜』有斐閣1980
'''
G = Digraph()
G.graph_attr.update(layout = 'dot', rankdir='LR', concentrate = 'true')
node_stack = [cell.split('\n')[0]]
for line in cell.split('\n')[1:]:
#print line, line.count(' ',0), len(node_stack),node_stack
if len(node_stack) < line.count(' ',0)+1:
#go one level deeper
node_stack.append(line.strip())
elif len(node_stack) == line.count(' ',0)+1:
#same level
node_stack.pop()
node_stack.append(line.strip())
elif len(node_stack) > line.count(' ',0)+1:
node_stack.pop()
node_stack.pop()
node_stack.append(line.strip())
G.edge(node_stack[-2], node_stack[-1])
#print node_stack[-2],'->', node_stack[-1]
return G
# ## Theme Map by Wikipedia Category
# In[103]:
class Thememap(object):
def __init__(self, first_keyword, max_level = 2):
self.G = Digraph()
self.first_keyword = first_keyword
self.max_level = max_level
self.all_categories = []
self.node_links = {}
# take a keyword and return a list of its parent concepts (Wikipedia categories)
def fetch_category(self, keyword):
url = "https://ja.wikipedia.org/wiki/" + keyword
r = requests.get(url)
soup = BeautifulSoup(r.text.encode('utf-8'),"lxml")
return_buffer = []
# parent concepts are extracted from the page's category links
#<div id="mw-normal-catlinks" class="mw-normal-catlinks">
# <li><a href="/wiki/Category:%E9%8A%80%E8%A1%8C" title="Category:銀行">銀行</a></li>
category_div_soup = soup.find("div", class_="mw-normal-catlinks")
if category_div_soup:
category_list_soup = category_div_soup.findAll("li")
for category_soup in category_list_soup:
return_buffer.append(category_soup.a.string)
return return_buffer
def regist_link(self, link_name, link_words):
# prefer the shorter link
if link_name not in self.node_links:
self.node_links[link_name] = link_words
def dot(self, start_word, now_level):
if now_level > self.max_level:
return
else:
now_fetch_categories = self.fetch_category(start_word)
if now_fetch_categories:
for now_category in now_fetch_categories:
print ('.', end ='')
# print now_category
if now_category in self.all_categories:
continue
else:
# print now_category のカテゴリ
link_word = start_word
start_word = start_word.replace('Category:','')
start_word = start_word.replace('のカテゴリ','')
self.regist_link(start_word, link_word)
if now_level+1 > self.max_level:
#leaf categories will not be expanded, so register their links here
self.regist_link(now_category, 'Category:'+now_category)
#add an edge
self.G.edge(start_word, now_category)
#record categories we have already seen to avoid duplicates
self.all_categories.append(now_category)
#recurse into the next level down
self.dot('Category:'+now_category, now_level+1)
def draw(self):
self.dot(self.first_keyword, 0)
# attach a URL link to each node
for link_name, link_word in self.node_links.items():
self.G.node(link_name, shape = 'plaintext', href='https://ja.wikipedia.org/wiki/' + link_word)
self.G.graph_attr.update(layout = 'dot', rankdir='LR', concentrate = 'true', size="50,50")
return self.G
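# A usage sketch for `Thememap` (this is what the %thememap magic further down wraps): crawl up to
# max_level levels of Wikipedia categories above the keyword and return a graphviz Digraph.
# In[ ]:
#tm = Thememap('深層学習', max_level=2)
#tm.draw()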
# ## Google Suggest Map
# In[109]:
def uniq(seq):
seen = set()
seen_add = seen.add
return [ x for x in seq if x not in seen and not seen_add(x)]
class SuggestMap(object):
def __init__(self, keyword, expand_flag=0):
self.keyword = keyword
self.expand_flag = expand_flag
self.suggested_list = []
self.node_links = {}
self.row = 'digraph{ graph[layout = neato, concentrate = true, overlap = false, splines = true, size="10,10"];'
self.row += self.keyword +' [shape=circle];\n'
self.row += 'node [shape = plaintext];\n'
#collect candidate terms via Google Suggest
def suggest(self, keyword):
suggested_list = []
tail_list = ['',' ','_']
# slower option that produces a much richer map
if self.expand_flag == 1:
tail_list.extend([' ' + chr(i+97) for i in range(26)]) #アルファベット全部
#skip the suffixes on the second pass; much faster and the result is the same
self.expand_flag = 0
elif self.expand_flag == 2:
tail_list.extend([' ' + chr(i+12450) for i in range(89)]) #カタカナ全部
#skip the suffixes on the second pass; much faster and the result is the same
self.expand_flag = 0
for tail in tail_list:
url = "http://www.google.com/complete/search?hl=ja&q="+keyword+tail+"&output=toolbar"
#print url.encode('utf-8')
r = requests.get(url)
# print r.encoding
soup = BeautifulSoup(r.text.encode('utf-8'),"lxml")
#<suggestion data="機械翻訳 比較"/>
suggest_list_soup = soup.findAll("suggestion")
for suggest_soup in suggest_list_soup:
print ('.', end ='')
now_suggested_text = dict(suggest_soup.attrs)['data']
now_suggested = keyword + ' ' + now_suggested_text
# remove duplicated words
now_suggested = ' '.join(uniq(now_suggested.split(' ')))
suggested_list.append(now_suggested)
return suggested_list
def regist_link(self, link_name, suggested_words):
# prefer the shorter suggestion as the link target
# print 'リンクする', link_name, suggested_words,len(suggested_words.split('+'))
if link_name in self.node_links:
# print 'リンク既出:', node_links[link_name],len(node_links[link_name].split(' '))
if len(self.node_links[link_name].split(' ')) > len(suggested_words.split(' ')):
# print 'よりシンプルなリンクなので:登録'
self.node_links[link_name] = suggested_words
else:
# print 'リンク初出:登録'
self.node_links[link_name] = suggested_words
def get_src_link(self, now_suggested):
# print now_suggested.encode('utf-8')
now_src = '"' + re.sub(r'[\s]+', '" -> "', now_suggested) + '";\n'
# 孔子 [href="http://ja.wikipedia.org/wiki/孔子"];
if now_suggested.split(' ')[-1] in self.keyword:
self.regist_link(now_suggested.split(' ')[0] ,now_suggested)
# print now_node_link.encode('utf-8')
else:
self.regist_link(now_suggested.split(' ')[-1] ,now_suggested)
return now_src
def draw(self):
first_list = self.suggest(self.keyword)
suggested_list = first_list
src = ''
for key in first_list:
for now_suggested in self.suggest(key):
src += self.get_src_link(now_suggested)
node_link_buffer = ''
for link_name, suggested_words in self.node_links.items():
node_link_buffer += '"'+ link_name + '" [href="https://www.google.co.jp/search?q='+suggested_words+'"];\n'
self.row += node_link_buffer + src + '}'
return Source(self.row)
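# A usage sketch for `SuggestMap` (wrapped by the %suggestmap magic further down): expand_flag 0 is
# normal mode, 1 additionally appends a-z to the query, 2 appends katakana, as described in that
# magic's help text.
# In[ ]:
#sm = SuggestMap('深層学習', expand_flag=0)
#sm.draw()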
# ### Base class for relation maps
# In[123]:
class RelateMap(object):
def __init__(self, keyword, max_depth, max_width):
print ('depth: ' + str(max_depth) + ' width: ' + str(max_width))
self.G = Digraph()
self.keyword = keyword
self.max_depth = max_depth
self.max_width = max_width
self.nodes = set()
self.result_df = pd.DataFrame(columns=['label','xlabel','url'])
self.construct_graph()
def construct_graph(self):
start_nodes = self.get_start_node()
for start_node in start_nodes:
self.build_edges(start_node, 0)
self.set_nodes_attr()
self.set_graph_attr()
def build_edges(self, start_node, now_depth):
if now_depth >= self.max_depth:
return
for next_node in self.get_nextnodes(start_node):
print ('.', end = '')
self.G.edge(start_node, next_node)
self.nodes.add(start_node)
self.nodes.add(next_node)
self.build_edges(next_node, now_depth +1)
def set_nodes_attr(self):
for n in self.nodes:
print ('.', end = '')
nlabel = self.get_node_label(n)
nxlabel = self.get_node_xlabel(n)
nshape = self.get_node_shape(n)
nhref = self.get_node_href(n)
nimage = self.get_node_image(n)
self.G.node(n, label = nlabel, bottomlabel= nxlabel, shape = nshape, href = nhref, image =nimage)
self.result_df = self.result_df.append(pd.DataFrame({'label': [nlabel], 'xlabel': [nxlabel], 'url' : [nhref]}))
def get_node_label(self, node):
return node
def get_node_xlabel(self, node):
return ''
def get_node_shape(self, node):
return 'plaintext'
def get_node_href(self, node):
return ''
def get_node_image(self, node):
return ''
def set_graph_attr(self):
self.G.graph_attr.update(layout = 'neato', rankdir='LR', concentrate = 'true', overlap = 'false', splines = 'true')
def g(self, size='10'):
'''
Render the graph and embed it in the HTML output.
Example: _.g()
Takes the preceding RelateMap object and renders its graph.
Example: _.g(20)
Takes the preceding RelateMap object and renders the graph at twice the screen size.
'''
self.G.attr('graph', size = '{},{}'.format(size, size))
return self.G
def df(self):
'''
Write the results out as a DataFrame (table).
'''
self.result_df.index = range(1, len(self.result_df)+1) #最後にインデックスつけ
return self.result_df
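# `RelateMap` is a template: subclasses only override `get_start_node`, `get_nextnodes` and the
# `get_node_*` hooks, and inherit the depth-limited crawl, the graphviz rendering (`g()`) and the
# DataFrame export (`df()`). A minimal offline sketch with a hard-coded toy graph (the node names
# are made up purely for illustration):
# In[ ]:
class ToyRelateMap(RelateMap):
    toy_edges = {'A': ['B', 'C'], 'B': ['C'], 'C': []}
    def get_start_node(self):
        return ['A']
    def get_nextnodes(self, node):
        return self.toy_edges.get(node, [])
ToyRelateMap('A', max_depth=2, max_width=2).g()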
# ### Relation map built from Amazon product data
# In[124]:
class AmazonRelateMap(RelateMap):
title_dict = {}
def get_start_node(self):
items = item_search(self.keyword, item_page = 1)[:self.max_width]
result = []
for item in items:
now_asin = item.find('asin').text
now_title = item.find('title').text
self.title_dict[now_asin] = now_title
result.append(now_asin)
return result
def get_nextnodes(self, asin):
'''
Take an ASIN and return a list of ASINs of similar books.
'''
response = amazon.SimilarityLookup(ItemId=asin, ResponseGroup="Small",
SearchIndex="Books", IdType="ISBN")
response = response.decode('utf-8')
soup = BeautifulSoup(response, "lxml")
result = []
if soup:
for similarproduct in soup.findAll('item')[:self.max_width]:
similar_asin = similarproduct.find('asin').text
similar_title = similarproduct.find('title').text
self.title_dict[similar_asin] = similar_title
result.append(similar_asin)
#print (asin, result)
return result
def get_node_label(self, asin):
return self.title_dict[asin]
def get_node_href(self, asin):
return 'http://www.amazon.co.jp/dp/' + str(asin)
def set_graph_attr(self):
self.G.graph_attr.update(layout = 'sfdp', rankdir='LR', concentrate = 'true', overlap = 'false', splines = 'true')
def df(self):
'''
Write the results out as a DataFrame (table).
'''
#reuse the superclass implementation
df = super(AmazonRelateMap, self).df()
#rename label -> title and derive the ASIN from the URL column
df = df.rename(columns={'label': 'title'})
df['asin'] = df['url'].str.replace('http://www.amazon.co.jp/dp/','')
return df[['title', 'url', 'asin']]
# ### Relation map built from Amazon product data (with cover images)
# In[142]:
class AmazonPictureRelateMap(AmazonRelateMap):
def image_download(self, asin, url):
'''
Download the cover image and save it as (asin).jpg.
'''
with open(asin + '.jpg', "wb") as f:
r = requests.get(url, headers=headers)
f.write(r.content)
def get_image_url(self, asin):
items = item_search(asin, item_page = 1)
if items and items[0].find('largeimage'):
return items[0].find('largeimage').find('url').text
else:
return ''
def get_node_image(self, asin):
url = self.get_image_url(asin)
if url:
self.image_download(asin, url)
return asin + '.jpg'
else:
return ''
def set_nodes_attr(self):
for n in self.nodes:
print (',', end = '')
nlabel = self.get_node_label(n)
nxlabel = self.get_node_xlabel(n)
nshape = self.get_node_shape(n)
nhref = self.get_node_href(n)
nimage = self.get_node_image(n)
if nimage:
#self.G.node(n, label = nlabel, shape = nshape, href = nhref, image =nimage, labelloc='b')
self.G.node(n, label = '', shape = nshape, href = nhref, image =nimage)
else:
self.G.node(n, label = nlabel, shape = nshape, href = nhref)
self.result_df = self.result_df.append(pd.DataFrame({'label': [nlabel], 'xlabel': [nxlabel], 'url' : [nhref]}))
# ### Relation map built from Kotobank related keywords
# In[143]:
class KotobankRelateMap(RelateMap):
start_node = ''
title_dict = dict()
def get_start_node(self):
self.start_node ='/word/' + urllib.parse.quote(self.keyword)
return self.start_node,
def get_nextnodes(self, parturl):
'''
Take a URL and return a list of URLs of related terms.
'''
#print parturl
result = []
#url = 'https://kotobank.jp/word/%E7%B5%8C%E6%B8%88%E4%BA%88%E6%B8%AC-58788'
r = requests.get('https://kotobank.jp' + parturl)
tree = lxml.html.fromstring(r.content)
this_title_tree = tree.xpath('//*[@id="mainTitle"]/h1/b')
#print this_title_tree[0].text
if this_title_tree:
self.title_dict[parturl] = this_title_tree[0].text
else:
return None
links = tree.xpath('//*[@id="retailkeyword"]/a')
for link in links:
similar_title = link.text
similar_url = link.attrib['href'] # /word/%E4%BA%88%E6%B8%AC-406123
self.title_dict[similar_url] = similar_title
result.append(similar_url)
return result
def get_node_href(self, parturl):
return 'https://kotobank.jp' + parturl
def get_node_label(self, parturl):
return self.title_dict.get(parturl)
def set_nodes_attr(self):
super(KotobankRelateMap, self).set_nodes_attr()
self.G.node(self.start_node, label = self.keyword, shape = 'circle')
# ### Relation map built from Weblio related terms
# In[157]:
class WeblioRelateMap(RelateMap):
start_node = ''
title_dict = dict()
def get_start_node(self):
self.start_node = urllib.parse.quote(self.keyword)
return self.start_node,
def get_nextnodes(self, url):
'''
Take a URL and return a list of URLs of related terms.
'''
#print (url)
result = []
# http://www.weblio.jp/content/%E4%BA%88%E6%B8%AC
r = requests.get('http://www.weblio.jp/content/' + url)
tree = lxml.html.fromstring(r.content)
this_title_tree = tree.xpath('//*[@id="topicWrp"]/span' )
#print this_title_tree[0].text
if this_title_tree:
self.title_dict[url] = this_title_tree[0].text
else:
return None
links = tree.xpath('//div[@class="sideRWordsL"]/a')
for link in links:
similar_title = link.text
similar_url = link.attrib['href'].replace('http://www.weblio.jp/content/','')
self.title_dict[similar_url] = similar_title
result.append(similar_url)
#print (result)
return result
def get_node_href(self, url):
return url
def get_node_label(self, url):
return self.title_dict.get(url)
def set_nodes_attr(self):
super(WeblioRelateMap, self).set_nodes_attr()
self.G.node(self.start_node, label = self.keyword, shape = 'circle')
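# A usage sketch for the Kotobank and Weblio maps (wrapped by the %kotobankmap and %webliomap magics
# further down). Both crawl "related keyword" links up to max_depth; max_width is not used by these
# two classes, so 0 is passed, mirroring what the magics do.
# In[ ]:
#KotobankRelateMap('深層学習', max_depth=2, max_width=0).g()
#WeblioRelateMap('深層学習', max_depth=2, max_width=0).g()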
# ## Defining the custom magic commands as an extension
# In[64]:
#helper function for %selectindex
def string2index(nowlist):
resultlist = []
for item in nowlist:
item = item.strip()
if item.isdigit():
item = int(item)
resultlist.append(item)
else:
if ':' in item:
innrt_item = item.split(':')
if innrt_item[0].isdigit() and innrt_item[1].isdigit():
item = range(int(innrt_item[0]), int(innrt_item[1])+1)
resultlist.extend(item)
return resultlist
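# `string2index` turns the comma-separated index arguments of %selectindex into a flat list of
# integers, expanding a:b ranges inclusively. A quick self-contained check:
# In[ ]:
string2index(['1', '4:7', '10'])  #returns [1, 4, 5, 6, 7, 10]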
# The class MUST call this class decorator at creation time
@magics_class
class MyMagics(Magics):
def line2value(self, line):
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
return now_user_namespace[line]
else:
return line
def line2df(self, f, line):
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
return f(now_variable)
else:
return line + ' is not DataFrame, but ' + str(type(now_variable))
else:
return line + ' is not defined.'
@line_magic
def epwing(self, line):
e = epwing_info(self.line2value(line))
df = e.collector()
return df
@line_magic
def kotobank(self, line):
k = Kotobank_info(self.line2value(line))
df = k.collector()
return df
@line_magic
def dictj(self, line):
line = self.line2value(line)
e = epwing_info(line)
df = e.collector()
df['url'] = '(epwing)'
k = Kotobank_info(line)
df = df.append(k.collector())
w = Wikipedia_Info(line, 'ja')
df = df.append(w.fetch_page())
df = df.sort_values(by='des_size') #並べ替え
df.index = range(1, len(df)+1) #最後にインデックスつけ
return df
@line_magic
def encyclopediacom(self, line):
k = encyclopediacom_info(self.line2value(line))
df = k.collector()
return df
#en English
@line_magic
def wikipedia_e(self, line):
w = Wikipedia_Info(str(self.line2value(line)),'en')
return w.fetch_page()
#simple Simple English
@line_magic
def wikipedia_s(self, line):
w = Wikipedia_Info(str(self.line2value(line)),'simple')
return w.fetch_page()
#de Deutsch
@line_magic
def wikipedia_de(self, line):
w = Wikipedia_Info(str(self.line2value(line)),'de')
return w.fetch_page()
#fr français
@line_magic
def wikipedia_fr(self, line):
w = Wikipedia_Info(str(self.line2value(line)),'fr')
return w.fetch_page()
#es español
@line_magic
def wikipedia_es(self, line):
w = Wikipedia_Info(str(self.line2value(line)),'es')
return w.fetch_page()
#ru русский
@line_magic
def wikipedia_ru(self, line):
w = Wikipedia_Info(str(self.line2value(line)),'ru')
return w.fetch_page()
@line_magic
def wikipedia_j(self, line):
w = Wikipedia_Info(self.line2value(line),'ja')
return w.fetch_page()
@line_magic
def ndl(self, line):
w = ndl_list_info(self.line2value(line))
df = w.collector()
return df
@line_magic
def cinii_list(self, line):
w = CiNii_list_info(self.line2value(line))
df = w.collector()
return df
@line_magic
def webcat(self, line):
'''
Search Webcat Plus Minus for books etc. and return the results as a DataFrame.
'''
#print (self.line2value(line))
w = webcat_list_info(self.line2value(line))
df = w.collector()
return df
@line_magic
def worldcat(self, line):
w = Worldcat_list_info(self.line2value(line))
df = w.collector()
return df
@line_magic
def reflist(self, line):
r = refcases_list_info(self.line2value(line))
df = r.collector()
return df
@line_magic
def refcase(self, line):
'''
Search the Collaborative Reference Database (レファレンス協同データベース) and return a DataFrame.
Usage:
%refcase (keyword)[,num]
keyword: search term (required)
num: maximum number of results (default 10)
Example: show 10 reference cases about 角倉了以
%refcase 角倉了以
Example: show 5 reference cases about 角倉了以
%refcase 角倉了以,5
'''
line_items = self.line2valuelist(line)
if len(line_items) > 1:
now_max_num = int(line_items[1])
else:
now_max_num = 10
r = Refcases_info(line_items[0], now_max_num)
df = r.collector()
df = pd.concat([df.ix[:,5:12], df.ix[:,20:21]], axis=1)
return df
@line_magic
def eb(self, line):
'''
Display every row of a DataFrame, turning URLs into links and embedding the result as HTML.
Usage:
%eb df
Example: embed the most recent result (a DataFrame)
%eb _
Example: embed the DataFrame from Out[3]
%eb _3
'''
return self.line2df(embed_df, line)
@line_magic
def html2df(self, line):
'''
Convert data that was embedded as HTML back into a DataFrame.
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'IPython.core.display.HTML' in str(type(now_variable)):
return pd.read_html(io.StringIO(now_variable.data), index_col =0, header=0)[0]
else:
return line + ' is not HTML, but ' + str(type(now_variable))
else:
return line + ' is not defined.'
#expand the table-of-contents column of a bibliographic DataFrame into multiple columns
@line_magic
def expand_content(self,line):
'''
Expand the table-of-contents entries contained in a DataFrame into separate columns.
Usage:
%expand_content df
Example: expand the contents column of the most recent result (a DataFrame)
%expand_content _
Example: expand the contents column of the DataFrame from Out[3]
%expand_content _3
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
df = now_user_namespace[line]
if 'DataFrame' in str(type(df)):
if 'contents' in df.columns:
split_contents_df = df['contents'].str.split('\t', n=0, expand=True).fillna(value='')
return pd.concat([df[['title','summary']], split_contents_df],axis=1)
else:
return df
else:
print (line, 'is not DataFrame, but ', type(df))
@line_magic
def extbook(self, line):
'''
Extract book titles (strings of the form 『書名』) from a DataFrame and return them as a new DataFrame.
Usage:
%extbook df
Example: extract the book titles contained in the most recent result (a DataFrame)
%extbook _
Example: extract the book titles contained in the DataFrame from Out[3]
%extbook _3
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
return ext_book(now_variable)
else:
print (line, 'is not DataFrame, but ', type(now_variable))
@line_magic
def amazonreviews(self, line):
'''
Take a DataFrame containing ISBNs and return a DataFrame of Amazon reviews.
Usage:
%amazonreviews df
Example: take the ISBNs from the most recent result (a DataFrame) and return a DataFrame of Amazon reviews
%amazonreviews _
Example: take the ISBNs from the DataFrame in Out[3] and return a DataFrame of Amazon reviews
%amazonreviews _3
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
a = amazon_reviews(now_variable)
return a.collector()
else:
print (line, 'is not DataFrame, but ', type(now_variable))
@line_magic
def stocks_amazon(self, line):
'''
Take a bibliographic DataFrame and return a DataFrame of Amazon stock and lowest new/used prices.
Returns: DataFrame
Usage: %stocks_amazon biblio_df
Example: look up the books in the bibliographic DataFrame from Out[1] and return their Amazon stock and lowest prices
%stocks_amazon _1
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
return amazon_stocks(now_variable)
else:
print (line, 'is not DataFrame, but ', type(now_variable))
@line_magic
def stocks_kosho(self, line):
'''
Take a bibliographic DataFrame and return a DataFrame of listings on 日本の古本屋 (titles, sellers, prices, etc.).
Returns: DataFrame
Usage: %stocks_kosho biblio_df
Example: look up the books in the bibliographic DataFrame from Out[1] and return the listed copies, sellers and prices
%stocks_kosho _1
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
return kosho_stocks(now_variable)
else:
print (line, 'is not DataFrame, but ', type(now_variable))
@line_magic
def stocks_ndl(self, line):
'''
Take a bibliographic DataFrame and return a DataFrame of holdings, loan status and reservation URLs
at the National Diet Library and the prefectural libraries.
Returns: DataFrame
Usage: %stocks_ndl biblio_df
Example: look up the books in the bibliographic DataFrame from Out[1] and return their holdings,
loan status and reservation URLs at the NDL and the prefectural libraries
%stocks_ndl _1
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
return ndl_stocks(now_variable)
else:
print (line, 'is not DataFrame, but ', type(now_variable))
@line_magic
def ndlcollection(self, line):
'''
Take a bibliographic DataFrame and return a DataFrame of links to the NDL Digital Collections,
the NDL copying service, and external repositories such as CiNii and JAIRO.
Returns: DataFrame
Usage: %ndlcollection biblio_df
Example: look up the books in the bibliographic DataFrame from Out[1] and return a table of links to
the NDL Digital Collections, the NDL copying service, and external repositories such as CiNii and JAIRO
%ndlcollection _1
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
return ndl_collection(now_variable)
else:
print (line, 'is not DataFrame, but ', type(now_variable))
@line_magic
def librarystatus(self, line):
'''
Take a bibliographic DataFrame containing ISBNs and a library-list DataFrame, and return a DataFrame
of ISBN, title, holdings, loan status and reservation URL for each library.
Returns: DataFrame
Usage: %librarystatus biblio_df, library_df
biblio_df: bibliographic DataFrame produced by %makebib
library_df: library-list DataFrame produced by %librarylist (defaults to MYLIBRARY)
Example: merge the book lists from Out[1], Out[3] and Out[7] into a DataFrame b_df, then build a DataFrame
of holdings, loan status and reservation URLs for the libraries near 淀屋橋駅
b_df = %makebib _1, _3, _7
l_df = %librarylist 淀屋橋駅
%librarystatus b_df, l_df
'''
now_user_namespace = self.shell.user_ns
if len(line.split(',')) == 2:
bib_df_name = line.split(',')[0].strip()
lib_df_name = line.split(',')[1].strip()
if bib_df_name in now_user_namespace.keys() and lib_df_name in now_user_namespace.keys():
bib_df = now_user_namespace[bib_df_name]
lib_df = now_user_namespace[lib_df_name]
if 'DataFrame' in str(type(bib_df)) and 'DataFrame' in str(type(lib_df)):
return library_status(bib_df, lib_df)
else:
print ('not DataFrame')
return
else:
print (bib_df_name, 'or', lib_df_name, 'is not defined.')
return
elif len(line.split(',')) == 1:
bib_df_name = line.split(',')[0].strip()
if bib_df_name in now_user_namespace.keys():
bib_df = now_user_namespace[bib_df_name]
if 'DataFrame' in str(type(bib_df)):
return library_status(bib_df)
else:
print ('not DataFrame')
return
else:
print (bib_df_name, 'is not defined.')
return
#selection helpers
@line_magic
def selectstr(self, line):
'''
From the given DataFrame, extract the rows whose specified column (field) contains the search string.
Returns: DataFrame
Usage:
%selectstr DataFrame, column_name, search_string
DataFrame: a DataFrame
column_name: column (field) name
search_string: string to search for
Example: from the most recent output (a DataFrame), extract the rows whose title field contains 文化祭
%selectstr _, title, 文化祭
'''
now_user_namespace = self.shell.user_ns
df_name = line.split(',')[0].strip()
column_name = line.split(',')[1].strip()
search_string = line.split(',')[2].strip()
if df_name in now_user_namespace.keys():
df = now_user_namespace[df_name]
if 'DataFrame' in str(type(df)):
return df[df[column_name].str.contains(search_string)]
else:
print (df_name, 'is not DataFrame, but ', type(df))
return
else:
print (df_name, 'is not defined.')
return
@line_magic
def selectcolumn(self, line):
'''
Extract the columns (fields) specified by name from the given DataFrame.
Returns: DataFrame
Usage:
%selectcolumn DataFrame, column_name1, column_name2, column_name3, ....
DataFrame: a DataFrame
column_name1: first column name
column_name2: second column name
...
Example: extract the title, summary and contents columns from the DataFrame in Out[94]
%selectcolumn _94, title, summary, contents
'''
now_user_namespace = self.shell.user_ns
df_name = line.split(',')[0].strip()
column_names = line.split(',')[1:]
select_columns = [col.strip() for col in column_names]
if df_name in now_user_namespace.keys():
df = now_user_namespace[df_name]
if 'DataFrame' in str(type(df)):
return df[select_columns]
else:
print (df_name, 'is not DataFrame, but ', type(df))
return
else:
print (df_name, 'is not defined.')
return
@line_magic
def selectindex(self, line):
'''
Extract the rows with the given index numbers from the given DataFrame.
Returns: DataFrame
Usage:
%selectindex DataFrame, index_num1, index_num2, ....
DataFrame: a DataFrame
index_num1: first index number
index_num2: second index number
...
Example: extract rows 1, 4-7 and 10 from the DataFrame in Out[94]
%selectindex _94, 1,4:7,10
'''
now_user_namespace = self.shell.user_ns
df_name = line.split(',')[0].strip()
index_names = line.split(',')[1:]
#select_indexs = [int(idx) for idx in index_names]
select_indexs = string2index(index_names)
#print (select_indexs)
if df_name in now_user_namespace.keys():
df = now_user_namespace[df_name]
if 'DataFrame' in str(type(df)):
return df.query("index == {}".format(select_indexs))
else:
print (df_name, 'is not DataFrame, but ', type(df))
return
else:
print (df_name, 'is not defined.')
return
@line_magic
def makebib(self, line):
'''
Take several book-list DataFrames, remove duplicates, and complete a bibliographic DataFrame with
details fetched from the Webcat detail pages.
Returns: DataFrame
Usage: %makebib df1, df2, df3, ...
Example: merge the DataFrames from Out[1], Out[3] and Out[7] (containing ISBNs, Webcat URLs, titles, etc.)
into a single bibliographic DataFrame
%makebib _1, _3, _7
'''
now_user_namespace = self.shell.user_ns
#print (list(now_user_namespace.keys()))
now_list = []
for item in line.split(','):
item = item.strip()
if item in now_user_namespace.keys():
now_variable = now_user_namespace[item]
if 'DataFrame' in str(type(now_variable)):
now_list.append(now_variable)
else:
print (item, 'is not DataFrame, but ', type(now_variable))
return
else:
print (item, 'is not defined.')
return
#print (type(now_list))
return make_biblio(now_list)
@line_magic
def concatdf(self, line):
'''
Take several DataFrames and concatenate them.
Returns: DataFrame
Usage: %concatdf df1, df2, df3, ...
'''
now_user_namespace = self.shell.user_ns
#print (list(now_user_namespace.keys()))
now_list = []
for item in line.split(','):
item = item.strip()
if item in now_user_namespace.keys():
now_variable = now_user_namespace[item]
if 'DataFrame' in str(type(now_variable)):
now_list.append(now_variable)
else:
print (item, 'is not DataFrame, but ', type(now_variable))
return
else:
print (item, 'is not defined.')
return
#print (type(now_list))
return pd.concat(now_list)
@line_magic
def df2csv(self, line):
'''
Convert a DataFrame to CSV.
'''
now_user_namespace = self.shell.user_ns
if line in now_user_namespace:
now_variable = now_user_namespace[line]
if 'DataFrame' in str(type(now_variable)):
output = io.StringIO()
now_variable.to_csv(output, index=False, encoding='utf-8')
print (output.getvalue())
return output.getvalue()
else:
print (line, 'is not DataFrame, but ', type(now_variable))
@line_magic
def kosho(self, line):
k = Kosho_info(self.line2value(line))
df = k.collector()
return df
@line_magic
def koshotitle(self, line):
k = Kosho_info_title(self.line2value(line))
df = k.collector()
return df
@line_magic
def google(self, line):
w = Google_info(self.line2value(line))
df = w.collector()
return df
@line_magic
def googlebook(self, line):
w = Google_book_info(self.line2value(line))
df = w.collector()
return df
@line_magic
def googlescholar(self, line):
w = Google_scholar_info(self.line2value(line))
df = w.collector()
return df
@line_magic
def LibraryGenesisArticles(self, line):
w = LibraryGenesisArticles(self.line2value(line))
df = w.collector()
return df
@line_magic
def thememap(self, line):
'''
Draw a theme map (concept hierarchy) using Wikipedia category information.
Usage: %thememap keyword
Example: %thememap 深層学習
'''
m = Thememap(self.line2value(line))
G = m.draw()
return G
@line_magic
def suggestmap(self, line):
'''
Use Google Suggest to collect terms that co-occur with the search term and draw a map.
Usage:
%suggestmap keyword [,1 or 2]
Example: normal mode - collect co-occurring terms and draw a map
%suggestmap 深層学習
Example: expand mode - collect co-occurring terms more exhaustively and draw a map
%suggestmap 深層学習,1
Example: super mode - collect co-occurring terms even more exhaustively and draw a map
%suggestmap 深層学習,2
'''
split_line = line.split(',')
if len(split_line) == 1:
print ('normal mode')
m = SuggestMap(str(self.line2value(line)),0)
g = m.draw()
return g
elif len(split_line) == 2 and split_line[1].strip() == '1':
print ('expand mode')
m = SuggestMap(str(self.line2value(split_line[0])),1)
g = m.draw()
return g
elif len(split_line) == 2 and split_line[1].strip() == '2':
print ('super mode')
m = SuggestMap(str(self.line2value(split_line[0])),2)
g = m.draw()
return g
else:
print ('mode error - see %suggestmap?')
def line2valuelist(self, lines):
return [self.line2value(line) for line in lines.split(',')]
@line_magic
def wikipedia(self, line):
'''
Search Wikipedia in the specified language.
Usage:
%wikipedia keyword, language
keyword: search term (required)
language: language code (ja, en, de, ...)
'''
line_items = self.line2valuelist(line)
#print (line_items)
keyword = line_items[0].strip()
language = line_items[1].strip()
print (keyword, language)
w = Wikipedia_Info(keyword, language)
return w.fetch_page()
@line_magic
def amazonmap(self, line):
'''
Use the Amazon Product Advertising API to draw a map of related products (Similarities).
Usage:
%amazonmap keyword, depth, width
keyword: search term (required)
depth: search depth (default 1)
width: search breadth (default 5)
Examples:
%amazonmap 文化祭企画
%amazonmap 深層学習, 3, 3
'''
line_items = self.line2valuelist(line)
print (line_items)
if len(line_items) > 1:
now_max_level = int(line_items[1])
else:
now_max_level = 1
if len(line_items) > 2:
now_start_num= int(line_items[2])
else:
now_start_num = 5
return AmazonRelateMap(str(line_items[0]), max_depth=now_max_level, max_width = now_start_num)
@line_magic
def amazonPmap(self, line):
'''
Use the Amazon Product Advertising API to draw a map of related products (Similarities) with cover images.
Usage:
%amazonPmap keyword, depth, width
keyword: search term (required)
depth: search depth (default 1)
width: search breadth (default 5)
Examples:
%amazonPmap 文化祭企画
%amazonPmap 深層学習, 3, 3
'''
line_items = self.line2valuelist(line)
if len(line_items) > 1:
now_max_level = int(line_items[1])
else:
now_max_level = 1
if len(line_items) > 2:
now_start_num= int(line_items[2])
else:
now_start_num = 5
return AmazonPictureRelateMap(str(line_items[0]), max_depth=now_max_level, max_width = now_start_num)
@line_magic
def kotobankmap(self, line):
'''
Draw a relation map using Kotobank's related keywords.
Usage:
%kotobankmap keyword, depth
keyword: search term (required)
depth: search depth (default 2)
Examples:
%kotobankmap 文化祭企画
%kotobankmap 深層学習, 2
'''
line_items = self.line2valuelist(line)
if len(line_items) > 1:
now_max_level = int(line_items[1])
else:
now_max_level = 2
print (line_items[0])
k = KotobankRelateMap(str(line_items[0]), max_depth=now_max_level, max_width = 0)
return k.g()
@line_magic
def webliomap(self, line):
'''
Draw a relation map using Weblio's related terms.
Usage:
%webliomap keyword, depth
keyword: search term (required)
depth: search depth (default 2)
Examples:
%webliomap 文化祭企画
%webliomap 深層学習, 3
'''
line_items = self.line2valuelist(line)
if len(line_items) > 1:
now_max_level = int(line_items[1])
else:
now_max_level = 2
w = WeblioRelateMap(str(line_items[0]), max_depth=now_max_level, max_width = 0)
return w.g()
@line_magic
def librarylist(self, line):
'''
Take a list of place names and return a DataFrame of nearby libraries (name, address, phone, geocode, URL, etc.).
Returns: DataFrame
Usage:
%librarylist address1, address2, address3, ....
Example: return a DataFrame of libraries (name, address, phone, geocode, URL, etc.) near three places
to be visited (京都市北区, 大阪市西区, 神戸市中央区)
%librarylist 京都市北区, 大阪市西区, 神戸市中央区
'''
now_list = self.line2valuelist(line)
return library_list([i for i in now_list])
@line_magic
def extamazon(self, line):
'''
Extract links to amazon.co.jp from the page at the given URL and return a DataFrame of title and asin.
'''
return ext_amazon(self.line2value(line))
@line_magic
def interlang(self, line):
'''
Use Wikipedia's interlanguage links to return a DataFrame with translations of each term.
'''
return interlanguage(self.line2value(line))
# In order to actually use these magics, you must register them with a
# running IPython. This code must be placed in a file that is loaded once
# IPython is up and running:
ip = get_ipython()
# You can register the class itself without instantiating it. IPython will
# call the default constructor on it.
ip.register_magics(MyMagics)
#magic commands registered this way do not need load_ipython_extension
# In[51]:
def load_ipython_extension(ipython):
ipython.register_magic_function(text2bib, 'cell')
ipython.register_magic_function(csv2df, 'cell')
ipython.register_magic_function(csv2dot, 'cell')
ipython.register_magic_function(indent2dot, 'cell')
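# If the cell magics above are instead saved into a plain module on the IPython path (for example a
# hypothetical monkeybench.py; the file name is an assumption, not fixed by this gist), they could be
# loaded in the usual way:
# In[ ]:
#%load_ext monkeybench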