Created
September 23, 2018 15:23
-
-
Save gaxiiiiiiiiiiii/e9d015a155f3628573d14427d36d123b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.request import urlopen | |
import json | |
from googlesearch import search | |
from bs4 import BeautifulSoup as bs | |
from gensim.models import word2vec | |
import numpy as np | |
from gensim.models import KeyedVectors | |
import MeCab | |
import pandas as pd | |
from datetime import datetime | |
def get_recent_article(date_span=7):
    """Fetch recent ALIS articles published within the last ``date_span`` days.

    Pages through the ALIS "recent articles" API (100 articles per page)
    until the oldest fetched article falls outside the requested span.

    Args:
        date_span: Number of days to look back (default 7).

    Returns:
        list: Raw article dicts taken from the API's "Items" field.
    """
    # initialize
    article_api = "https://alis.to/api/articles/recent?limit=100&page=%d"
    page = 1
    result = []
    # Fetch pages until the date span is covered.
    while True:
        raw_article_data = urlopen(article_api % page).read().decode("utf-8")
        json_article_data = json.loads(raw_article_data)
        articles_data = json_article_data["Items"]
        # Guard: if the API runs out of articles the original code would
        # raise IndexError on result[-1] (or loop forever); stop instead.
        if not articles_data:
            return result
        result.extend(articles_data)
        # Loop-termination check: compare the oldest fetched article's
        # publication date against the requested span.
        today = datetime.now()
        last_data_date = datetime.fromtimestamp(int(result[-1]["published_at"]))
        if (today - last_data_date).days >= (date_span + 1):
            return result
        page += 1
def get_original_text(original_url):
    """Extract the article body from a https://coinsforest.com/ page.

    Args:
        original_url: URL of the article page to scrape.

    Returns:
        str: Text of the page's "post_content" div, newlines removed.
    """
    raw_html = urlopen(original_url).read().decode("utf-8")
    parsed = bs(raw_html, "html5lib", from_encoding="utf-8")
    content_div = parsed.find("div", attrs={"class": "post_content"})
    return content_div.text.replace("\n", "")
def get_alis_text(article_id):
    """Fetch an ALIS article and return its body as plain text.

    Args:
        article_id: ALIS article identifier.

    Returns:
        str: Body text with newlines, ASCII spaces and ideographic
        spaces (U+3000) stripped out.
    """
    alis_api = "https://alis.to/api/articles/%s"
    response = urlopen(alis_api % article_id).read().decode("utf-8")
    article = json.loads(response)
    text = bs(article["body"], "html5lib", from_encoding="utf-8").text
    for unwanted in ("\n", " ", "\u3000"):
        text = text.replace(unwanted, "")
    return text
def _generate_get_vector():
    """Build a closure that turns Japanese text into a 200-dim vector.

    Uses pre-trained word vectors downloaded from
    http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    (entity_vector.model.bin, expected in the working directory).

    Returns:
        callable: ``get_vector(text) -> np.ndarray`` of shape (200,),
        the mean vector of the content words found in ``text``.
    """
    mt = MeCab.Tagger('')
    # Prime the tagger: works around a known mecab-python3 issue where the
    # first parseToNode call can return empty surface forms.
    mt.parse('')
    model = KeyedVectors.load_word2vec_format("entity_vector.model.bin", binary=True)
    # Compute the text's vector as the average of its word vectors.
    def get_vector(text):
        """Average the vectors of the nouns, verbs and adjectives in text."""
        sum_vec = np.zeros(200)
        word_count = 0
        node = mt.parseToNode(text)
        while node:
            fields = node.feature.split(",")
            # Restrict to content words: nouns, verbs, adjectives.
            if fields[0] in ('名詞', '動詞', '形容詞'):
                try:
                    # KeyedVectors supports direct item access; the original
                    # ``model.wv[...]`` accessor is deprecated on KeyedVectors.
                    sum_vec += model[node.surface]
                    word_count += 1
                except KeyError:
                    # Skip out-of-vocabulary words only; the original bare
                    # ``except`` also silenced unrelated errors.
                    pass
            node = node.next
        if word_count == 0:
            # No known content word: return the zero vector instead of
            # dividing by zero (which yields a NaN vector and a warning).
            return sum_vec
        return sum_vec / word_count
    return get_vector
def cos_sim(v1, v2):
    """Return the cosine similarity between vectors ``v1`` and ``v2``."""
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / denominator
def get_similarity(origin, alis):
    """Cosine similarity between the averaged word vectors of two texts.

    Relies on a module-level ``get_vector`` closure being bound first
    (created in ``__main__`` via ``_generate_get_vector``).

    Args:
        origin: First text.
        alis: Second text.

    Returns:
        float: Cosine similarity of the two text vectors.
    """
    return cos_sim(get_vector(origin), get_vector(alis))
if __name__ == "__main__":
    # Build the text-vectorizer closure (loads the pre-trained model from disk).
    get_vector = _generate_get_vector()
    news_site_url = "https://coinsforest.com/"
    # Fetch ALIS articles published within the last 3 days.
    articles = get_recent_article(3)
    # NOTE(review): ``create_data`` is not defined anywhere in the visible
    # file — running this script as-is raises NameError. Presumably it lives
    # in a part of the gist not shown here; confirm before use.
    result = create_data(articles,news_site_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment