Skip to content

Instantly share code, notes, and snippets.

@yumaueno
Created August 16, 2019 10:22
Show Gist options
  • Save yumaueno/f837d783adbac0ed5ce8edfb726ffd6c to your computer and use it in GitHub Desktop.
Save yumaueno/f837d783adbac0ed5ce8edfb726ffd6c to your computer and use it in GitHub Desktop.
類似度算出を行ってデータベース格納 (compute article similarity and store it in a database)
import requests
from bs4 import BeautifulSoup
import sys
import MeCab
from time import sleep
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import sqlite3
def geturl(urls):
    """Download each URL and return one string per page.

    Parameters
    ----------
    urls : list[str]
        Pages to fetch.

    Returns
    -------
    list[str]
        For each page, the text of every <p> element concatenated together.
    """
    collected = []
    for target in urls:
        response = requests.get(target)
        soup = BeautifulSoup(response.content, "html.parser")
        # Gather the visible text of every paragraph on the page.
        paragraphs = [node.text for node in soup.find_all("p")]
        collected.append(''.join(paragraphs))
        sleep(1)  # be polite: pause between requests
    return collected
def mplg(article):
    """Tokenize *article* with MeCab and return its nouns, space-separated.

    Parameters
    ----------
    article : str
        Raw Japanese text to analyze.

    Returns
    -------
    str
        Every noun surface form found, each preceded by a single space
        (empty string when no noun is found).
    """
    word_list = ""
    m = MeCab.Tagger()
    # BUG FIX: the original called m.parse(text), reading a global variable
    # instead of the `article` parameter, so the argument was ignored.
    m1 = m.parse(article)
    for row in m1.split("\n"):
        cols = row.split("\t")
        word = cols[0]  # first tab field: the surface form (morpheme)
        if word == "EOS":
            break  # MeCab's end-of-sentence marker terminates the output
        if len(cols) < 2:
            continue  # defensively skip blank/malformed lines (no POS field)
        pos = cols[1]  # second tab field: the part-of-speech info
        # Keep nouns only ("名詞" is the first two characters of the POS tag).
        if pos[:2] == "名詞":
            word_list = word_list + " " + word
    return word_list
def tfidf(word_list):
    """Compute the TF-IDF matrix for a list of documents.

    Parameters
    ----------
    word_list : list[str]
        One whitespace-separated token string per document.

    Returns
    -------
    numpy.ndarray
        Dense TF-IDF matrix, one row per document.
    """
    docs = np.array(word_list)
    # The token pattern also accepts single-character words, which the
    # default sklearn pattern would drop.
    vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
    return vectorizer.fit_transform(docs).toarray()
def cossim(v1, v2):
    """Return the cosine similarity of vectors *v1* and *v2*."""
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / denominator
# Thin wrapper around the SQLite database that stores article match rates.
class Db():
    """Persist and retrieve article similarity results in SQLite."""

    def __init__(self, dbname):
        # Path of the SQLite database file to open on each operation.
        self.db = dbname

    def db_input(self, article):
        """Store one (text_1, text_2, match_rate) tuple.

        NOTE: the article_match table is dropped and rebuilt on every call,
        so the database holds at most the single most recent row.
        """
        conn = sqlite3.connect(self.db)
        cur = conn.cursor()
        ddl = '''DROP TABLE IF EXISTS article_match;
create table article_match (text_1 verchar,text_2 verchar,match_rate double)'''
        cur.executescript(ddl)
        # Parameterized insert: the ? placeholders are filled from the tuple.
        cur.execute(
            'insert into article_match (text_1, text_2, match_rate) values (?,?,?)',
            article,
        )
        conn.commit()
        cur.close()
        conn.close()

    def db_output(self):
        """Return the first stored row as a tuple, or None if the table is empty."""
        conn = sqlite3.connect(self.db)
        cur = conn.cursor()
        cur.execute('select * from article_match')
        row = cur.fetchone()
        conn.commit()
        cur.close()
        conn.close()
        return row
# --- Script entry: fetch two articles, compare them, persist the result ---
word_list = []
url = ["https://toukei-lab.com/conjoint", "https://toukei-lab.com/correspondence"]
texts = geturl(url)  # one concatenated <p>-text string per article
for text in texts:
    word_list.append(mplg(text))  # noun-only bag of words per article
vecs = tfidf(word_list)  # TF-IDF matrix: one row per article
match_rate = cossim(vecs[1], vecs[0])  # cosine similarity of the two rows
article = (url[0], url[1], match_rate)
dbname = 'article.db'
db = Db(dbname)
db.db_input(article)
article = db.db_output()
# BUG FIX: the original ended with the bare expression `article[2]`, whose
# value is silently discarded when run as a script; print it instead.
print(article[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment