Created
August 16, 2019 10:22
-
-
Save yumaueno/f837d783adbac0ed5ce8edfb726ffd6c to your computer and use it in GitHub Desktop.
類似度算出を行ってデータベース格納
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import sys | |
import MeCab | |
from time import sleep | |
import numpy as np | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
import sqlite3 | |
def geturl(urls, timeout=10.0, delay=1.0):
    """Download each page and return the text of its <p> elements.

    Args:
        urls: Iterable of page URLs to fetch.
        timeout: Seconds to wait for each HTTP response. Without a timeout
            a stalled server would block this call indefinitely.
        delay: Seconds to pause after each request so the target site is
            not hit in a tight loop.

    Returns:
        A list with one string per URL, in input order. Each string is the
        text of every <p> element on that page, joined with no separator.
    """
    all_text = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, "html.parser")
        all_text.append("".join(p.text for p in soup.find_all("p")))
        sleep(delay)
    return all_text
def mplg(article, tagger=None):
    """Extract the nouns from *article* using MeCab.

    Args:
        article: Text to analyse morphologically.
        tagger: Optional object with a ``parse(str) -> str`` method. When
            omitted a new ``MeCab.Tagger()`` is created; pass one in to
            reuse the same tagger across many articles.

    Returns:
        A string in which every noun is preceded by a single space
        (e.g. ``" word1 word2"``), or ``""`` when no noun is found.
    """
    if tagger is None:
        tagger = MeCab.Tagger()
    # Parse the argument itself; the previous code read an unrelated
    # variable named ``text`` and ignored ``article`` entirely.
    parsed = tagger.parse(article)
    nouns = []
    for row in parsed.split("\n"):
        # Each row is "<surface>\t<features>"; the features start with the
        # part of speech.
        surface, _, features = row.partition("\t")
        if surface == "EOS":
            break
        # Rows without a tab yield empty features and are skipped instead
        # of raising IndexError.
        if features[:2] == "名詞":
            nouns.append(surface)
    return "".join(" " + noun for noun in nouns)
def tfidf(word_list):
    """Return the TF-IDF matrix for a list of space-separated documents.

    Args:
        word_list: Sequence of strings, one per document.

    Returns:
        A dense array with one row per document, as produced by
        ``TfidfVectorizer.fit_transform(...).toarray()``.
    """
    # The custom token pattern keeps single-character tokens, which the
    # vectorizer's default pattern would drop.
    vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
    corpus = np.array(word_list)
    weights = vectorizer.fit_transform(corpus)
    return weights.toarray()
def cossim(v1, v2):
    """Return the cosine similarity of vectors *v1* and *v2*.

    Args:
        v1: First 1-D numeric vector.
        v2: Second 1-D numeric vector of the same length.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero length (the ratio would otherwise be 0/0 and evaluate to NaN).
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator
## Database storage for the similarity result
class Db():
    """Minimal SQLite store holding one article-similarity record."""

    def __init__(self, dbname):
        """Remember the database file path; no connection is opened here."""
        self.db = dbname

    def db_input(self, article):
        """Recreate the ``article_match`` table and insert one row.

        Any existing ``article_match`` table is dropped first, so after
        this call the table contains only the row passed in.

        Args:
            article: A ``(text_1, text_2, match_rate)`` sequence.
        """
        conn = sqlite3.connect(self.db)
        try:
            c = conn.cursor()
            create_table = '''DROP TABLE IF EXISTS article_match;
            create table article_match (text_1 varchar,text_2 varchar,match_rate double)'''
            c.executescript(create_table)
            # Values are bound through ? placeholders, never formatted
            # into the SQL text.
            sql = 'insert into article_match (text_1, text_2, match_rate) values (?,?,?)'
            c.execute(sql, article)
            conn.commit()
        finally:
            # Close even when a statement raises, so the file handle is
            # never leaked.
            conn.close()

    def db_output(self):
        """Return the first row of ``article_match``.

        Returns:
            A ``(text_1, text_2, match_rate)`` tuple, or ``None`` when the
            table is empty.
        """
        conn = sqlite3.connect(self.db)
        try:
            c = conn.cursor()
            select_sql = 'select * from article_match'
            c.execute(select_sql)
            # A SELECT changes nothing, so no commit is needed here.
            return c.fetchone()
        finally:
            conn.close()
# Compare two articles: fetch them, reduce each to its nouns, vectorise
# with TF-IDF, and store the cosine similarity in SQLite.
word_list = []
url = ["https://toukei-lab.com/conjoint", "https://toukei-lab.com/correspondence"]
texts = geturl(url)
for text in texts:
    word_list.append(mplg(text))
vecs = tfidf(word_list)
match_rate = cossim(vecs[1], vecs[0])
article = (url[0], url[1], match_rate)
dbname = 'article.db'
db = Db(dbname)
db.db_input(article)
# Read the row back to confirm what was stored.
article = db.db_output()
# A bare ``article[2]`` only displays a value in an interactive session;
# print it so the script shows the stored similarity.
print(article[2])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment