@gaxiiiiiiiiiiii
Created September 23, 2018 15:23
from urllib.request import urlopen
import json
from googlesearch import search
from bs4 import BeautifulSoup as bs
import numpy as np
from gensim.models import KeyedVectors
import MeCab
import pandas as pd
from datetime import datetime
def get_recent_article(date_span=7):
    # initialize
    article_api = "https://alis.to/api/articles/recent?limit=100&page=%d"
    page = 1
    result = []
    # fetch recent articles page by page
    while True:
        raw_article_data = urlopen(article_api % page).read().decode("utf-8")
        json_article_data = json.loads(raw_article_data)
        articles_data = json_article_data["Items"]
        result.extend(articles_data)
        # stop once the oldest article fetched is older than date_span days
        today = datetime.now()
        last_data_date = datetime.fromtimestamp(int(result[-1]["published_at"]))
        if (today - last_data_date).days >= (date_span + 1):
            return result
        else:
            page += 1
def get_original_text(original_url):
    # extract the body text from a https://coinsforest.com/ article
    html = urlopen(original_url).read().decode("utf-8")
    soup = bs(html, "html5lib")  # html is already decoded, so no from_encoding needed
    original_text = soup.find("div", attrs={"class": "post_content"}).text.replace("\n", "")
    return original_text
def get_alis_text(article_id):
    # extract the body text from an ALIS article
    alis_api = "https://alis.to/api/articles/%s"
    raw_article_data = urlopen(alis_api % article_id).read().decode("utf-8")
    json_article_data = json.loads(raw_article_data)
    body = json_article_data["body"]
    alis_text = bs(body, "html5lib").text.replace("\n", "").replace(" ", "").replace("\u3000", "")
    return alis_text
def _generate_get_vector():
    # uses the pretrained Japanese Wikipedia entity vectors (entity_vector.model.bin)
    # downloaded from http://www.cl.ecei.tohoku.ac.jp/~m-suzuki/jawiki_vector/
    mt = MeCab.Tagger('')
    mt.parse('')  # work around the mecab-python bug that garbles node.surface
    model = KeyedVectors.load_word2vec_format("entity_vector.model.bin", binary=True)

    # compute the average word vector of a text
    def get_vector(text):
        sum_vec = np.zeros(200)
        word_count = 0
        node = mt.parseToNode(text)
        while node:
            fields = node.feature.split(",")
            # restrict to nouns (名詞), verbs (動詞), and adjectives (形容詞)
            if fields[0] in ('名詞', '動詞', '形容詞'):
                try:
                    sum_vec += model[node.surface]
                    word_count += 1
                except KeyError:
                    # skip words missing from the pretrained vocabulary
                    pass
            node = node.next
        # avoid a zero division when no content word was found
        return sum_vec / word_count if word_count > 0 else sum_vec
    return get_vector
def cos_sim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
def get_similarity(origin, alis):
    # relies on the module-level get_vector created in __main__
    origin_vec = get_vector(origin)
    alis_vec = get_vector(alis)
    similarity = cos_sim(origin_vec, alis_vec)
    return similarity
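# NOTE: create_data is called in __main__ but never defined in this gist. The
# sketch below is an assumption reconstructed from the otherwise-unused imports
# (googlesearch.search, pandas): for each ALIS article, google its title
# restricted to the news site, fetch the top hit, and score how similar the two
# bodies are. Field names such as "article_id" and "title" follow the ALIS API
# response; treat the whole function as a hypothetical reconstruction, not the
# author's original code.
def create_data(articles, news_site_url):
    rows = []
    for article in articles:
        alis_text = get_alis_text(article["article_id"])
        query = "site:%s %s" % (news_site_url, article["title"])
        for url in search(query):
            # keep only the top search hit
            try:
                original_text = get_original_text(url)
                rows.append({
                    "article_id": article["article_id"],
                    "title": article["title"],
                    "original_url": url,
                    "similarity": get_similarity(original_text, alis_text),
                })
            except Exception:
                # skip pages whose body cannot be extracted
                pass
            break
    return pd.DataFrame(rows)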
if __name__ == "__main__":
    get_vector = _generate_get_vector()
    news_site_url = "https://coinsforest.com/"
    articles = get_recent_article(3)
    result = create_data(articles, news_site_url)
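    # assuming the create_data sketch above returns a DataFrame, the
    # highest-scoring rows are the ALIS posts most similar to the news site
    print(result.sort_values("similarity", ascending=False).head())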