uhfx/20210211-mtg.md

## 20210211-mtg.md

      
    Raw
  

              20210211-mtg.md
            
          
    app.py

すべての関数の読み込み
test7-2.py

all.py を利用した通常の検索（クエリを1回だけ受け取り，コサイン類似度が0より大きい質問文書を一覧表示する）
test8.py

検索結果を絞り込むために，候補の単語を提示して追加のキーワード入力を求める（合計2回質問する）

  
## all.py
import pandas as pd
import MeCab
tagger = MeCab.Tagger("-Ochasen")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from collections import Counter
import collections
import itertools

def q_get(text_paths):
    """Read the question text out of each CSV-like file.

    Each file is read whole and split on every comma. The piece at index 8
    is taken as the question text; its newlines are removed and surrounding
    double quotes are stripped.

    Args:
        text_paths: Iterable of file paths to read.

    Returns:
        list[str]: One question string per path, in input order. A file
        with fewer than nine comma-separated pieces yields "".
    """
    texts = []
    for text_path in text_paths:
        # Context manager closes the handle promptly; the original left
        # every file open until garbage collection.
        with open(text_path, 'r') as f:
            cells = f.read().split(',')
        # NOTE(review): a plain split also breaks on commas inside quoted
        # cells, so index 8 is a position in the raw text rather than a
        # true CSV column - confirm against the data files.
        question = ' '.join(cells[8:9])  # slice: a short row gives '' instead of IndexError
        question = question.replace('\n', '').strip('"')
        texts.append(question)

    return texts

def a_get(text_paths):
    """Read the answer text out of each CSV-like file.

    Each file is read whole and split on every comma. The piece at index 16
    is taken as the answer text and surrounding double quotes are stripped.
    Newlines inside the answer are kept (removing them was deliberately
    disabled in the original because it hurt readability).

    Args:
        text_paths: Iterable of file paths to read.

    Returns:
        list[str]: One answer string per path, in input order. A file with
        fewer than seventeen comma-separated pieces yields "".
    """
    a_texts = []
    for text_path in text_paths:
        # Context manager closes the handle promptly; the original left
        # every file open until garbage collection.
        with open(text_path, 'r') as f:
            cells = f.read().split(',')
        answer = ' '.join(cells[16:17])  # slice: a short row gives '' instead of IndexError
        a_texts.append(answer.strip('"'))

    return a_texts

def load_stopwords(path="data/jp_stop_words.txt"):
    """Load the stop-word list from a local text file.

    The file is parsed with ``pandas.read_csv`` without a header row and the
    first column is returned, so each line contributes one entry.

    The original code recorded the following URL as the download source for
    the file (the download step itself was disabled, so the file must
    already exist at ``path``):
    http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt

    Args:
        path: Location of the stop-word file.

    Returns:
        list: The values of the first column, in file order.
    """
    return pd.read_csv(path, header=None)[0].tolist()

def preprocess(series, flags=('名詞', '固有名詞', '動詞', '形容詞')):
    """Tokenize every text with MeCab and keep selected parts of speech.

    For each element of ``series`` the text is parsed with the module-level
    ``tagger``. A token is kept, as the value at index 6 of its
    comma-separated feature string (the base form according to the
    original comments), when that value is not '*', is at least two
    characters long, is not a stop word, and its part of speech is enabled
    by ``flags``:

    * '名詞'     - any noun
    * '固有名詞' - nouns whose sub-category is proper noun
    * '動詞'     - independent (自立) verbs
    * '形容詞'   - independent (自立) adjectives

    Args:
        series: pandas Series of raw texts.
        flags: Collection of part-of-speech names to keep. Only membership
            is tested, so a list, tuple or set all work.

    Returns:
        pandas.Series: Same index as ``series``; each value is the kept
        tokens joined by single spaces and lower-cased.
    """
    # A set makes the per-token stop-word test O(1) instead of a list scan.
    stop_words = set(load_stopwords())

    # Resolve the flags once instead of once per token.
    keep_noun = '名詞' in flags
    keep_proper_noun = '固有名詞' in flags
    keep_verb = '動詞' in flags
    keep_adjective = '形容詞' in flags

    def tokenizer_func(text):
        tokens = []
        node = tagger.parseToNode(str(text))
        while node:
            features = node.feature.split(',')
            node = node.next  # advance first so every `continue` below is safe

            # Guard against feature strings that lack an index-6 field.
            if len(features) < 7:
                continue
            base = features[6]
            if base == '*' or len(base) < 2 or base in stop_words:
                continue

            pos, sub_pos = features[0], features[1]
            if pos == '名詞':
                wanted = keep_noun or (sub_pos == '固有名詞' and keep_proper_noun)
            elif pos == '動詞':
                wanted = sub_pos == '自立' and keep_verb
            elif pos == '形容詞':
                wanted = sub_pos == '自立' and keep_adjective
            else:
                wanted = False

            if wanted:
                tokens.append(base)
        # Normalise to lower case.
        return " ".join(tokens).lower()

    return series.map(tokenizer_func)

# query_preprocess は不要．
# def query_preprocess(query_series): # 前処理
#     stop_words = load_stopwords() # ストップワードの削除
#     def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
#         tokens = []
#         node = tagger.parseToNode(str(text))
#         while node:
#             features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
#             surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
#             if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
#                 node = node.next
#                 continue
#             noun_flag = (features[0] == '名詞')
#             proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
#             verb_flag = (features[0] == '動詞') & (features[1] == '自立')
#             adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
#             if proper_noun_flag:
#                 tokens.append(surface)
#             elif noun_flag:
#                 tokens.append(surface)
#             elif verb_flag:
#                 tokens.append(surface)
#             elif adjective_flag:
#                 tokens.append(surface)
#             node = node.next
#         return " ".join(tokens)
#
#     query_series = query_series.map(tokenizer_func)
#     # query_series = tokenizer_func(query_series)
#
#     #---------------Normalization-----------#
#     query_series = query_series.map(lambda x: x.lower()) # 小文字に統一
#     # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
#     return query_series

def question_vector(series, query_series=None):
    """Convert the documents and the query into TF-IDF vectors.

    The vectorizer is fitted on ``series`` only; ``query_series`` is then
    projected onto the same vocabulary.

    The original body read ``query_series`` without ever defining it, so
    every call failed with NameError. It is now an explicit parameter,
    added with a default so the existing one-argument signature still
    parses.

    Args:
        series: Iterable of pre-tokenized, space-separated documents.
        query_series: Iterable of pre-tokenized, space-separated queries.

    Returns:
        tuple: ``(document_vectors, query_vectors)`` as dense arrays.

    Raises:
        ValueError: If ``query_series`` is not supplied.
    """
    if query_series is None:
        raise ValueError("query_series must be provided to build the query vector")

    tfidf = TfidfVectorizer()
    document_vectors = tfidf.fit_transform(series).toarray()
    query_vectors = tfidf.transform(query_series).toarray()
    return document_vectors, query_vectors

def get_cs(query_series, series):
    """Score every document against the query with TF-IDF cosine similarity.

    A fresh ``TfidfVectorizer`` is fitted on ``series``; ``query_series`` is
    transformed with that same vocabulary, and both are densified before
    the similarity is computed.

    Args:
        query_series: Iterable of pre-tokenized, space-separated queries.
        series: Iterable of pre-tokenized, space-separated documents.

    Returns:
        The result of ``cosine_similarity(documents, queries)``: one row per
        document and one column per query.
    """
    vectorizer = TfidfVectorizer()
    document_matrix = vectorizer.fit_transform(series).toarray()
    query_matrix = vectorizer.transform(query_series).toarray()
    return cosine_similarity(document_matrix, query_matrix)

def get_len_series(series):
    """Return the number of documents in ``series``.

    The original fitted a full TF-IDF model and returned the row count of
    the resulting matrix, which is simply the number of input documents
    (not a word count, despite the old comment). Taking ``len`` directly
    gives the same number without the vectorization cost.

    Args:
        series: Sized collection of documents (e.g. a pandas Series).

    Returns:
        int: Number of documents.
    """
    return len(series)

def find_top_n(n, cs):
    """Return the indices of the ``n`` highest scores, best first.

    Args:
        n: Maximum number of indices to return. Values <= 0 give an empty
            result; values larger than the number of scores return all of
            them.
        cs: Array of scores. It is flattened before sorting
            (``axis=None``), so the returned indices refer to the flattened
            array.

    Returns:
        numpy.ndarray: Indices ordered from highest to lowest score.
    """
    descending = np.argsort(cs, axis=None)[::-1]
    # Slice from the front of the descending order. The original `[-n:]`
    # returned *every* index for n == 0 (because -0 == 0) and an unrelated
    # tail for negative n.
    return descending[:max(n, 0)]

def get_n_cs(cs, top_n_index, top_n_indices=None):
    """Return the similarity score stored in row ``top_n_index`` of ``cs``.

    The original looped over ``top_n_indices`` and recomputed the same
    value on every pass; with an empty ``top_n_indices`` the loop never ran
    and the function failed with UnboundLocalError. The lookup does not
    depend on that argument at all, so it is now done directly.

    Args:
        cs: Two-dimensional array of scores; column 0 is read.
        top_n_index: Row to read.
        top_n_indices: Unused; kept (now optional) so existing
            three-argument calls keep working.

    Returns:
        The value ``cs[top_n_index][0]``.
    """
    return cs[top_n_index][0]

####
# 今困ってるのはコサイン類似度0より大きいものの文書に出てくる単語を抽出し表示，その単語を選択させるプログラムが出来ない
# 1. 単語の抽出
# 2. 単語の表示
# 3. 単語の選択（単語自体の入力 もしくは 番号で選択させる）
# 4. 文書を1つに絞るまでやる

# 同じ内容の文書（似たような文書）がある．その辺の扱いは一旦保留


# def get_cs_words(query_series, series, texts):
def get_new_words(cs, texts):
    """Extract nouns and proper nouns from every document with a non-zero score.

    Args:
        cs: Array of similarity scores whose rows line up with ``texts``.
        texts: Sequence of raw question texts.

    Returns:
        pandas.Series: One space-separated token string per selected
        document, produced by ``preprocess`` restricted to nouns and
        proper nouns.
    """
    # nonzero()[0] holds the row index of each non-zero entry.
    # NOTE(review): with more than one column in `cs` a row could appear
    # more than once - callers in this file pass a single-query matrix.
    matching_rows = cs.nonzero()[0]
    matching_texts = [texts[row] for row in matching_rows]

    # The leftover debug `print(type(...))` and the dead `new_series = {}`
    # initialisation from the original were removed.
    return preprocess(pd.Series(matching_texts), ['名詞', '固有名詞'])

    # new_q_series = pd.Series(texts[top_n_indices])

    # tfidf = TfidfVectorizer() # Tf-Idf 化関数に名前を付ける
    # question_vector = tfidf.fit_transform(series).toarray() # 質問文書を Tf-Idf を用いて数値化
    # query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
    # # print(query_vector)
    # # cs_nonzero = query_vector.nonzero()
    # # nonzero_indices = np.argwhere(cs != 0)
    # # for nonzero_index in nonzero_indices:
    # #      # print(texts[nonzero_index])
    # #      print(nonzero_index)
    # # return series[nonzero_index]
    # nonzero_indices = tfidf.inverse_transform(query_vector)
    # print(nonzero_indices)
    # n_cs
    # return nonzero_indices

# やること
# 3. 単語の選択（単語自体の入力 もしくは 番号で選択させる）
# 4. 文書を1つに絞るまでやる

def select_topic(new_series):
    """Count how often each word occurs across the token strings.

    The counts are meant to be shown to the user as candidate words for
    narrowing down the search.

    Args:
        new_series: pandas Series of space-separated token strings.

    Returns:
        collections.Counter: Mapping of word -> number of occurrences.
    """
    # Entries that are not strings come back as NaN from .str.split; drop
    # them so only real token lists are iterated.
    token_lists = new_series.str.split(' ').dropna()

    # Flatten lazily (the original `sum(lists, [])` is quadratic) and skip
    # the '' that splitting an empty document produces, so a blank entry is
    # never offered as a selectable word.
    words = (word for tokens in token_lists for word in tokens if word)
    return collections.Counter(words)

def new_input():
    """Prompt the user to pick one or more of the suggested words.

    Returns:
        str: The raw line typed by the user (multiple words are expected
        to be separated by half-width spaces, as the prompt asks).
    """
    prompt = '上の中から近いワードを選んでください．複数選択する場合は半角スペースで区切って入力してください．: '
    return input(prompt)

# def new_question(new_query_pd, str_query):
def new_str_query(new_input, str_query):
    """Append the user's extra words to the original processed query.

    Args:
        new_input: Additional words typed by the user (space-separated).
        str_query: The processed query; its first element is used.

    Returns:
        pandas.Series: Single-element Series holding
        ``"<first query> <new_input>"``.
    """
    # Take the first element by position. `str_query[0]` on a Series is a
    # label lookup and raises KeyError when the index has no label 0.
    if hasattr(str_query, 'iloc'):
        first_query = str_query.iloc[0]
    else:
        first_query = str_query[0]

    # Work on the element rather than str(Series), which would also embed
    # the "dtype: object" footer in the text.
    return pd.Series(first_query + ' ' + new_input)

def new_question_answer(new_str_query, processed_new_texts_series, n):
    """Rank the shortlisted documents against the refined query.

    Args:
        new_str_query: Refined query (original query plus the user's words).
        processed_new_texts_series: Pre-tokenized shortlisted documents.
        n: Maximum number of indices to return.

    Returns:
        The indices produced by ``find_top_n`` for the similarity scores
        from ``get_cs``, i.e. positions in ``processed_new_texts_series``
        ordered from most to least similar.
    """
    similarities = get_cs(new_str_query, processed_new_texts_series)
    return find_top_n(n, similarities)

def listing_query(query):
    """Wrap a single query in a new one-element list.

    Args:
        query: The query object (a string in this file's callers).

    Returns:
        list: ``[query]``.
    """
    return [query]

## test7-2.py
import argparse
import numpy as np
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display

import pandas as pd

import MeCab
tagger = MeCab.Tagger("-Ochasen")
import mojimoji
import os
import urllib


text_paths = glob.glob('data/ocu2/*.txt')
from func import all
# from func import q_get
# from func import a_get
# from func import load_stopwords
# from func import preprocess
# from func import get_cs
# from func import find_top_n
# from func import get_n_cs
# from func import listing_query

def main(args):
    """Run a one-shot similarity search for ``args.query`` and print the hits.

    Every question document is scored against the query with TF-IDF cosine
    similarity. If the best score exceeds 1e-10, the ranked index list is
    printed, followed by each document whose own score exceeds 1e-10;
    otherwise "NotFound" is printed.

    Args:
        args: Parsed command-line arguments; only ``args.query`` is read.
    """
    questions = all.q_get(text_paths)
    a_texts = all.a_get(text_paths)  # answers are loaded but not used below
    query_list = all.listing_query(args.query)

    # Tokenize the corpus first, then the query.
    str_data = all.preprocess(pd.Series(questions))
    str_query = all.preprocess(pd.Series(query_list))

    cs = all.get_cs(str_query, str_data)
    document_count = all.get_len_series(str_data)
    ranked = all.find_top_n(document_count, cs)  # every document, best first

    best_score = cs[np.argmax(cs)][0]
    if not best_score > 1e-10:
        print("NotFound")
        return

    print(f"該当する質問番号: {ranked}")
    for doc_index in ranked:
        score = all.get_n_cs(cs, doc_index, ranked)
        if score > 1e-10:
            print(f"質問データ #{doc_index}, コサイン類似度: {score}, '{questions[doc_index]}'")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("query", type=str)
    args = parser.parse_args()
    main(args)

## test8.py
import importlib
# importlib.reload()
import argparse
import numpy as np
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display

import pandas as pd

import MeCab
tagger = MeCab.Tagger("-Ochasen")
import mojimoji
import os
import urllib

from collections import Counter
import collections

text_paths = glob.glob('data/ocu2/*.txt')
from func import all
# from func import q_get
# from func import a_get
# from func import load_stopwords
# from func import preprocess
# from func import get_cs
# from func import find_top_n
# from func import get_n_cs
# from func import listing_query

def main(args):
    """Run a two-round search: list the hits, ask for extra words, then re-rank.

    Round 1 scores every question document against ``args.query`` and
    prints each document whose cosine similarity exceeds 1e-10. The word
    counts (nouns / proper nouns) of the matching documents are then shown
    and the user is asked to type the closest words. Round 2 appends those
    words to the query and re-scores only the documents listed in round 1.
    The best one is printed if its similarity exceeds 0.4; otherwise the
    user is told to re-enter the search words.

    Changes from the original:
      * the round-2 similarities are computed once instead of twice
        (``new_question_answer`` followed by a second ``get_cs``);
      * the final ``for`` loop always broke on its first iteration, so it
        is written as an explicit check of the top hit;
      * the round-2 result now prints the document's index in the full
        corpus, matching the numbers shown in round 1, instead of its
        position inside the shortlist;
      * the unused load of the answer texts was dropped.

    Args:
        args: Parsed command-line arguments; only ``args.query`` is read.
    """
    pos_flags = ['名詞', '固有名詞', '動詞', '形容詞']
    accept_threshold = 0.4  # minimum round-2 similarity to accept the top hit

    texts = all.q_get(text_paths)
    str_data = all.preprocess(pd.Series(texts), pos_flags)
    str_query = all.preprocess(pd.Series(all.listing_query(args.query)), pos_flags)

    cs = all.get_cs(str_query, str_data)
    n = all.get_len_series(str_data)  # number of documents in the data set
    top_n_indices = all.find_top_n(n, cs)  # every document, best first

    new_series = all.get_new_words(cs, texts)

    max_cs = cs[np.argmax(cs)][0]
    if not max_cs > 1e-10:
        print("NotFound")
        return

    # ---- Round 1: show every hit and remember it for round 2 ----
    shortlist_indices = []  # corpus index of each shortlisted document
    new_texts = []          # raw text of each shortlisted document
    for top_n_index in top_n_indices:
        n_cs = all.get_n_cs(cs, top_n_index, top_n_indices)
        if n_cs > 1e-10:
            print(f"質問データ #{top_n_index}, コサイン類似度: {n_cs}, '{texts[top_n_index]}'")
            shortlist_indices.append(top_n_index)
            new_texts.append(texts[top_n_index])

    # Show how often each candidate word occurs so the user can pick.
    print(all.select_topic(new_series))

    # ---- Round 2: refine the query and re-rank the shortlist ----
    processed_new_texts_series = all.preprocess(pd.Series(new_texts), pos_flags)
    extra_words = all.new_input()
    new_str_query = all.new_str_query(extra_words, str_query)

    new_cs = all.get_cs(new_str_query, processed_new_texts_series)
    new_top_n_indices = all.find_top_n(n, new_cs)
    print(new_top_n_indices)

    if len(new_top_n_indices) == 0:
        print("検索ワードを入力し直してください")
        return

    best_position = new_top_n_indices[0]  # position inside the shortlist
    best_cs = all.get_n_cs(new_cs, best_position, new_top_n_indices)
    if best_cs > accept_threshold:
        print(f"質問データ #{shortlist_indices[best_position]}, コサイン類似度: {best_cs}, '{new_texts[best_position]}'")
    else:
        print("検索ワードを入力し直してください")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("query", type=str)
    args = parser.parse_args()
    main(args)
	import pandas as pd
	import MeCab
	tagger = MeCab.Tagger("-Ochasen")
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	from collections import Counter
	import collections
	import itertools

	def q_get(text_paths): # 質問文書を text に格納
	texts = []
	for text_path in text_paths:
	text = open(text_path, 'r').read()
	text = text.split(',') # CSV ファイルのセルで分割
	text = ' '.join(text[8:9]) # 質問文書部分
	text = text.replace( '\n' , '' ) # 質問文書の改行を削除
	text = text.strip('"') # CSV ファイルのセル " を削除
	# text = text.replace('する', '') # するできるの削除（不要？）
	# text = text.replace('できる', '')
	texts.append(text) # 配列 texts に格納

	return texts

	def a_get(text_paths): # 回答文書を text に格納
	a_texts = []
	for text_path in text_paths:
	a_text = open(text_path, 'r').read()
	a_text = a_text.split(',') # CSV ファイルのセルで分割
	a_text = ' '.join(a_text[16:17]) # 質問文書部分
	# a_text = a_text.replace( '\n' , '' ) # 質問文書の改行を削除．読みにくいのでやっぱり不要
	a_text = a_text.strip('"') # CSV ファイルのセル " を削除
	a_texts.append(a_text) # 配列 a_texts に格納

	return a_texts

	def load_stopwords(path="data/jp_stop_words.txt"): # ストップワードの読み込み外部ファイル
	url = 'http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt'
	# if os.path.exists(path):
	# print('ストップワードの読み込み完了')
	# else:
	# print('ストップワードのダウンロード中')
	# urllib.request.urlretrieve(url, path)
	return pd.read_csv(path, header=None)[0].tolist()

	def preprocess(series, flags = ['名詞', '固有名詞', '動詞', '形容詞']): # 前処理
	stop_words = load_stopwords() # ストップワードの削除
	def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
	tokens = []
	node = tagger.parseToNode(str(text))
	while node:
	features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
	surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
	if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
	node = node.next
	continue

	if (features[0] == '名詞') & ('名詞' in flags): # MeCab での名詞かつ名詞という flag の付いたものを抽出
	tokens.append(surface)
	elif ((features[0] == '名詞') & (features[1] == '固有名詞')) & ('固有名詞' in flags): # MeCab での固有名詞かつ固有名詞という flag の付いたものを抽出
	tokens.append(surface)
	elif ((features[0] == '動詞') & (features[1] == '自立')) & ('動詞' in flags): # MeCab での動詞（自立）かつ動詞という flag の付いたものを抽出
	tokens.append(surface)
	elif ((features[0] == '形容詞') & (features[1] == '自立')) & ('形容詞' in flags): # MeCab での形容詞かつ形容詞という flag の付いたものを抽出
	tokens.append(surface)

	# noun_flag = (features[0] == '名詞')
	# proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
	# verb_flag = (features[0] == '動詞') & (features[1] == '自立')
	# adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
	# if proper_noun_flag:
	# tokens.append(surface)
	# elif noun_flag:
	# tokens.append(surface)
	# elif verb_flag:
	# tokens.append(surface)
	# elif adjective_flag:
	# tokens.append(surface)

	node = node.next
	return " ".join(tokens)

	series = series.map(tokenizer_func)

	#---------------Normalization-----------#
	series = series.map(lambda x: x.lower()) # 小文字に統一
	# series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．

	return series

	# query_preprocess は不要．
	# def query_preprocess(query_series): # 前処理
	# stop_words = load_stopwords() # ストップワードの削除
	# def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
	# tokens = []
	# node = tagger.parseToNode(str(text))
	# while node:
	# features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
	# surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
	# if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
	# node = node.next
	# continue
	# noun_flag = (features[0] == '名詞')
	# proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
	# verb_flag = (features[0] == '動詞') & (features[1] == '自立')
	# adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
	# if proper_noun_flag:
	# tokens.append(surface)
	# elif noun_flag:
	# tokens.append(surface)
	# elif verb_flag:
	# tokens.append(surface)
	# elif adjective_flag:
	# tokens.append(surface)
	# node = node.next
	# return " ".join(tokens)
	#
	# query_series = query_series.map(tokenizer_func)
	# # query_series = tokenizer_func(query_series)
	#
	# #---------------Normalization-----------#
	# query_series = query_series.map(lambda x: x.lower()) # 小文字に統一
	# # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
	# return query_series

	def question_vector(series): # 質問文書を Tf-Idf を用いて数値化
	tfidf = TfidfVectorizer()
	question_vector = tfidf.fit_transform(series).toarray() #コサイン類似度の評価
	query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
	# 複数の返り値 https://pg-chain.com/python-function-return#toc3
	return question_vector, query_vector

	def get_cs(query_series, series): # 質問文書を MeCab で処理したあとのものをコサイン類似度を評価．
	tfidf = TfidfVectorizer() # Tf-Idf 化関数に名前を付ける
	question_vector = tfidf.fit_transform(series).toarray() # 質問文書を Tf-Idf を用いて数値化
	query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
	cs = cosine_similarity(question_vector, query_vector) # コサイン類似度の評価
	# print(type(query_vector))
	return cs # それぞれのコサイン類似度を評価

	def get_len_series(series): # 質問文書の単語の総量を求める
	tfidf = TfidfVectorizer()
	question_vector = tfidf.fit_transform(series).toarray() #コサイン類似度の評価

	return len(question_vector) # それぞれのコサイン類似度を評価

	def find_top_n(n, cs): # コサイン類似度上から順に n 件の配列番号を取得する
	arr_top_n_indices = np.argsort(cs, axis = None)[-n:]
	top_n_indices = arr_top_n_indices[::-1] # 降順にソート

	return top_n_indices # top_n_indices は n 個の配列，一つ一つは番号

	def get_n_cs(cs, top_n_index, top_n_indices): # 配列番号 top_n_index 番目のコサイン類似度の取得
	for n_cs in top_n_indices:
	n_cs = cs[top_n_index][0]

	return n_cs

	####
	# 今困ってるのはコサイン類似度0より大きいものの文書に出てくる単語を抽出し表示，その単語を選択させるプログラムが出来ない
	# 1. 単語の抽出
	# 2. 単語の表示
	# 3. 単語の選択（単語自体の入力もしくは番号で選択させる）
	# 4. 文書を1つに絞るまでやる

	# 同じ内容の文書（似たような文書）がある．その辺の扱いは一旦保留


	# def get_cs_words(query_series, series, texts):
	def get_new_words(cs, texts): # コサイン類似度の高い質問文書から，名詞と固有名詞のみを抽出する．
	new_texts = [] # 配列
	new_series = {} # pandas の series の形式
	# print(type(cs.nonzero()))
	new_nums = cs.nonzero()[0] # コサイン類似度の行列の1行目だけ欲しい
	for new_num in new_nums:
	# print(texts[new_num])
	new_texts.append(texts[new_num]) # 配列に要素を追加
	# print(new_texts)
	# # for new_text in new_texts: #不要
	# # new_text = texts[new_num]
	# # print(new_text)
	# # new_texts.append(new_text)
	# # new_texts = texts[new_nums[0]]
	# new_list = listing_query(texts[new_num])
	new_texts_pd = pd.Series(new_texts) # pandas の series に new_texts を格納
	# new_series = all.preprocess(new_texts_pd, ['名詞', '固有名詞']) # (消さない)対話型でやるときはこっち．all.〜入り
	new_series = preprocess(new_texts_pd, ['名詞', '固有名詞']) # 名詞と固有名詞だけを抽出
	# # print(new_texts)
	# print(new_series)
	print(type(new_series))
	return new_series

	# new_q_series = pd.Series(texts[top_n_indices])

	# tfidf = TfidfVectorizer() # Tf-Idf 化関数に名前を付ける
	# question_vector = tfidf.fit_transform(series).toarray() # 質問文書を Tf-Idf を用いて数値化
	# query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
	# # print(query_vector)
	# # cs_nonzero = query_vector.nonzero()
	# # nonzero_indices = np.argwhere(cs != 0)
	# # for nonzero_index in nonzero_indices:
	# # # print(texts[nonzero_index])
	# # print(nonzero_index)
	# # return series[nonzero_index]
	# nonzero_indices = tfidf.inverse_transform(query_vector)
	# print(nonzero_indices)
	# n_cs
	# return nonzero_indices

	# やること
	# 3. 単語の選択（単語自体の入力もしくは番号で選択させる）
	# 4. 文書を1つに絞るまでやる

	def select_topic(new_series): # 類似の質問文書を絞るために series から単語を選ばせる
	new_series_value = []
	new_series_values = []
	new_series_values = new_series.str.split(' ') # 半角スペースで区切る
	new_series_value = sum(new_series_values, []) # 単語の出現回数を数えるために，2次元配列を1次元にした

	# print(new_series_values)

	new_series_count = collections.Counter(new_series_value) # 単語の出現回数を出力
	# print(type(new_series_count))
	return new_series_count

	def new_input(): # 提案したワードを入力させる
	new_input = input('上の中から近いワードを選んでください．複数選択する場合は半角スペースで区切って入力してください．: ')
	# new_query = new_input.split(' ')
	# new_query = new_input
	# new_query_pd = pd.Series(new_input)
	# return new_query_pd
	return new_input

	# def new_question(new_query_pd, str_query):
	def new_str_query(new_input, str_query): # str_query に提案したデータを付与したもの
	str_query_str = str_query[0] # pandas の1行目データ抜き出す．str_query 自体を str 形式にしてしまうと "dtype:object" も入ってしまうため．
	str_query_str = str_query_str + ' ' + new_input
	new_str_query = pd.Series(str_query_str)
	# new_str_query_pd_str = str_query.str + new_query_pd.str
	return new_str_query # 新しい情報を付加したクエリ

	def new_question_answer(new_str_query, processed_new_texts_series, n): # 新しいクエリと1度選ばれた質問文書のコサイン類似度の上位の文書番号集合を取得
	new_cs = get_cs(new_str_query, processed_new_texts_series) # 新しいクエリと1度選ばれた質問文書のコサイン類似度の取得
	new_top_n_indices = find_top_n(n, new_cs) # コサイン類似度の高い順に配列番号を並び替える
	return new_top_n_indices # コサイン類似度の配列番号を出力

	def listing_query(query): # 質問文書を queries に格納
	list_query = []
	list_query.append(query) # 配列 queries に格納

	return list_query
	import argparse
	import numpy as np
	import glob
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	from IPython.display import display

	import pandas as pd

	import MeCab
	tagger = MeCab.Tagger("-Ochasen")
	import mojimoji
	import os
	import urllib


	text_paths = glob.glob('data/ocu2/*.txt')
	from func import all
	# from func import q_get
	# from func import a_get
	# from func import load_stopwords
	# from func import preprocess
	# from func import get_cs
	# from func import find_top_n
	# from func import get_n_cs
	# from func import listing_query

	def main(args):
	texts = all.q_get(text_paths)
	a_texts = all.a_get(text_paths)
	query_texts = all.listing_query(args.query)
	# query_texts = all.listing_query("VPNが繋がらない")
	q_series = pd.Series(texts)
	query_series = pd.Series(query_texts)
	processed_q_series = all.preprocess(q_series)
	str_data = processed_q_series
	processed_query_series = all.preprocess(query_series)
	# str_query = map(str, processed_query_series)
	str_query = processed_query_series
	cs = all.get_cs(str_query, str_data)
	# n = 295
	n = all.get_len_series(str_data) # データセット文書数
	top_n_indices = all.find_top_n(n, cs) # コサイン類似度の取得
	max_index = np.argmax(cs)
	max_cs = cs[max_index][0]
	if max_cs > 1e-10:
	print(f"該当する質問番号: {top_n_indices}")
	# print(f"{lst_top_n}") # np.array 形式から通常の list 形式に変換したものを表示
	# print(f"配列個数:{len(cs)}, コサイン類似度: {max_cs}, 配列番号: {max_index}, 類似度最大単語: '{max_data}'")
	for top_n_index in top_n_indices: # 結果の表示
	n_cs = all.get_n_cs(cs, top_n_index, top_n_indices) # 各コサイン類似度の取得
	if n_cs > 1e-10:
	print(f"質問データ #{top_n_index}, コサイン類似度: {n_cs}, '{texts[top_n_index]}'")
	else:
	print("NotFound")

	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("query", type=str)
	args = parser.parse_args()
	main(args)
	import importlib
	# importlib.reload()
	import argparse
	import numpy as np
	import glob
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	from IPython.display import display

	import pandas as pd

	import MeCab
	tagger = MeCab.Tagger("-Ochasen")
	import mojimoji
	import os
	import urllib

	from collections import Counter
	import collections

	text_paths = glob.glob('data/ocu2/*.txt')
	from func import all
	# from func import q_get
	# from func import a_get
	# from func import load_stopwords
	# from func import preprocess
	# from func import get_cs
	# from func import find_top_n
	# from func import get_n_cs
	# from func import listing_query

	def main(args):
	texts = all.q_get(text_paths)
	a_texts = all.a_get(text_paths)
	query_texts = all.listing_query(args.query)
	# query_texts = all.listing_query("VPNが繋がらない")
	q_series = pd.Series(texts)
	query_series = pd.Series(query_texts)
	processed_q_series = all.preprocess(q_series, ['名詞', '固有名詞', '動詞', '形容詞'])
	str_data = processed_q_series
	processed_query_series = all.preprocess(query_series, ['名詞', '固有名詞', '動詞', '形容詞'])
	str_query = processed_query_series
	cs = all.get_cs(str_query, str_data)
	# nonzero_indices = all.get_cs_words(str_query, str_data, cs)
	# for nonzero_index in nonzero_indices:
	# print(q_series[nonzero_index])
	# n = 295
	n = all.get_len_series(str_data) # データセット文書数
	top_n_indices = all.find_top_n(n, cs) # コサイン類似度の取得

	#
	new_series = all.get_new_words(cs, texts)
	# print(new_series)

	# print(new_texts)
	max_index = np.argmax(cs)
	max_cs = cs[max_index][0]

	new_texts =[]

	if max_cs > 1e-10:
	# print(f"該当する質問番号: {top_n_indices}")
	# print(f"{lst_top_n}") # np.array 形式から通常の list 形式に変換したものを表示
	# print(f"配列個数:{len(cs)}, コサイン類似度: {max_cs}, 配列番号: {max_index}, 類似度最大単語: '{max_data}'")
	for top_n_index in top_n_indices: # 結果の表示
	n_cs = all.get_n_cs(cs, top_n_index, top_n_indices) # 各コサイン類似度の取得
	if n_cs > 1e-10:
	print(f"質問データ #{top_n_index}, コサイン類似度: {n_cs}, '{texts[top_n_index]}'")
	new_text = texts[top_n_index]
	new_texts.append(new_text) # 提示されているデータを新しい質問文書リストに格納

	# print(new_texts)
	new_series_count = all.select_topic(new_series) # 言葉の出現回数の取得
	print(new_series_count) # 言葉の出現回数の表示

	new_texts_series = pd.Series(new_texts) # 1度選ばれた質問文書を pandas のシリーズにする
	processed_new_texts_series = all.preprocess(new_texts_series, ['名詞', '固有名詞', '動詞', '形容詞']) # 1度選ばれた質問文書を処理し，名詞，固有名詞，動詞，形容詞のみにする
	new_input = all.new_input() # 提案されたワードを入力させる
	new_str_query = all.new_str_query(new_input, str_query) # はじめに入力されたクエリと後で追加されたクエリを合併させる
	new_top_n_indices = all.new_question_answer(new_str_query, processed_new_texts_series, n) # 質問文書をクエリと提示された文書とのコサイン類似度を大きい順に並べる．

	print(new_top_n_indices)
	new_cs = all.get_cs(new_str_query, processed_new_texts_series) # 新しいクエリと新しい質問文書とのコサイン類似度の取得
	for new_top_n_index in new_top_n_indices: # 結果の表示
	new_n_cs = all.get_n_cs(new_cs, new_top_n_index, new_top_n_indices) # n 番目のコサイン類似度の取得
	if new_n_cs > 0.4:
	print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
	# print(f"#{new_top_n_index} コサイン類似度: {new_n_cs}")
	break
	if new_n_cs <= 0.4:
	print("検索ワードを入力し直してください")
	break


	else:
	print("NotFound")

	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("query", type=str)
	args = parser.parse_args()
	main(args)