uhfx/1-bachelor-thesis-programs.md

## 1-bachelor-thesis-programs.md

      
    Raw
  

              1-bachelor-thesis-programs.md
            
          
    各種プログラム最終版

  
## all.py
import pandas as pd
import MeCab
tagger = MeCab.Tagger("-Ochasen")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

from collections import Counter
import collections
import itertools

def q_get(text_paths):
    """Read the question text out of each CSV file.

    Every file is read in full and split on commas; the field at index 8 is
    taken as the question text. Newlines are removed from it and leading /
    trailing double quotes are stripped.

    Args:
        text_paths: Iterable of paths to the CSV files.

    Returns:
        list[str]: One question string per path, in input order. A file with
        fewer than nine comma-separated fields yields an empty string.
    """
    texts = []
    for text_path in text_paths:
        # Use a context manager so the handle is closed even if read() fails.
        with open(text_path, 'r') as f:
            raw = f.read()
        # NOTE(review): a plain split(',') ignores CSV quoting, so a comma
        # inside an earlier cell shifts the field index — confirm the data
        # never contains one.
        cells = raw.split(',')
        text = ' '.join(cells[8:9])  # slice (not index) so short rows give ''
        text = text.replace('\n', '')  # drop line breaks inside the question
        text = text.strip('"')  # drop the CSV cell quotes
        texts.append(text)

    return texts

def a_get(text_paths):
    """Read the answer text out of each CSV file.

    Every file is read in full and split on commas; the field at index 16 is
    taken as the answer text. Leading / trailing double quotes are stripped.
    Unlike the question text, line breaks are kept for readability.

    Args:
        text_paths: Iterable of paths to the CSV files.

    Returns:
        list[str]: One answer string per path, in input order. A file with
        fewer than seventeen comma-separated fields yields an empty string.
    """
    a_texts = []
    for text_path in text_paths:
        # Use a context manager so the handle is closed even if read() fails.
        with open(text_path, 'r') as f:
            raw = f.read()
        # NOTE(review): a plain split(',') ignores CSV quoting, so a comma
        # inside an earlier cell shifts the field index — confirm the data
        # never contains one.
        cells = raw.split(',')
        a_text = ' '.join(cells[16:17])  # answer field; slice tolerates short rows
        a_text = a_text.strip('"')  # drop the CSV cell quotes
        a_texts.append(a_text)

    return a_texts

def load_stopwords(path="data/jp_stop_words.txt"):
    """Load the stop-word list from a local file.

    The file is parsed with ``pandas.read_csv`` (no header row) and the first
    column is returned as a list.

    The list is presumably the SlothLib Japanese stop-word file
    (http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt);
    this function does not download it, so the file must already exist.

    Args:
        path: Location of the stop-word file.

    Returns:
        list: Values of the first column of the file.
    """
    return pd.read_csv(path, header=None)[0].tolist()

# def preprocess(series, flags = ['名詞', '固有名詞', '動詞', '形容詞']): # 前処理
#     stop_words = load_stopwords() # ストップワードの削除
#     def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
#         tokens = []
#         node = tagger.parseToNode(str(text))
#         while node:
#             features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
#             surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
#             if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
#                 node = node.next
#                 continue
#
#             if (features[0] == '名詞') & ('名詞' in flags): # MeCab での名詞かつ名詞という flag の付いたものを抽出
#                 tokens.append(surface)
#             elif ((features[0] == '名詞') & (features[1] == '固有名詞')) & ('固有名詞' in flags): # MeCab での固有名詞かつ固有名詞という flag の付いたものを抽出
#                 tokens.append(surface)
#             elif ((features[0] == '動詞') & (features[1] == '自立')) & ('動詞' in flags): # MeCab での動詞（自立）かつ動詞という flag の付いたものを抽出
#                 tokens.append(surface)
#             elif ((features[0] == '形容詞') & (features[1] == '自立')) & ('形容詞' in flags): # MeCab での形容詞かつ形容詞という flag の付いたものを抽出
#                 tokens.append(surface)
#
#             # noun_flag = (features[0] == '名詞')
#             # proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
#             # verb_flag = (features[0] == '動詞') & (features[1] == '自立')
#             # adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
#             # if proper_noun_flag:
#             #     tokens.append(surface)
#             # elif noun_flag:
#             #     tokens.append(surface)
#             # elif verb_flag:
#             #     tokens.append(surface)
#             # elif adjective_flag:
#             #     tokens.append(surface)
#
#             node = node.next
#         return " ".join(tokens)
#
#     series = series.map(tokenizer_func)
#
#     #---------------Normalization-----------#
#     series = series.map(lambda x: x.lower()) # 小文字に統一
#     # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
#
#     return series
def preprocess(series, flags=('名詞', '固有名詞', '動詞', '形容詞')):
    """Tokenize each text with MeCab and keep only the selected word classes.

    For every MeCab node the base form (feature field 6) is used. A node is
    dropped when its base form is ``'*'``, is shorter than two characters, or
    is a stop word. Of the remaining nodes, those whose part of speech is
    enabled in ``flags`` are kept:

    * ``'名詞'``     — any noun
    * ``'固有名詞'`` — nouns whose sub-category is proper noun
    * ``'動詞'``     — independent (自立) verbs
    * ``'形容詞'``   — independent (自立) adjectives

    Args:
        series: pandas Series of texts (each value is passed through ``str``).
        flags: Collection of part-of-speech labels to keep.

    Returns:
        pandas Series of the same length in which every text is replaced by
        its kept base forms, joined by single spaces and lower-cased.
    """
    # A set gives constant-time membership tests; the check runs once per token.
    stop_words = set(load_stopwords())

    def is_selected(features):
        # Decide from the part of speech (field 0) and sub-category (field 1).
        pos, sub = features[0], features[1]
        if pos == '名詞':
            return ('名詞' in flags) or (sub == '固有名詞' and '固有名詞' in flags)
        if pos == '動詞':
            return sub == '自立' and '動詞' in flags
        if pos == '形容詞':
            return sub == '自立' and '形容詞' in flags
        return False

    def tokenizer_func(text):
        tokens = []
        node = tagger.parseToNode(str(text))
        while node:
            features = node.feature.split(',')  # feature string is comma separated
            base = features[6]  # base (dictionary) form of the word
            usable = base != '*' and len(base) >= 2 and base not in stop_words
            if usable and is_selected(features):
                tokens.append(base)
            node = node.next
        # Normalisation: unify to lower case.
        return " ".join(tokens).lower()

    return series.map(tokenizer_func)
# query_preprocess は不要．
# def query_preprocess(query_series): # 前処理
#     stop_words = load_stopwords() # ストップワードの削除
#     def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
#         tokens = []
#         node = tagger.parseToNode(str(text))
#         while node:
#             features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
#             surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
#             if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
#                 node = node.next
#                 continue
#             noun_flag = (features[0] == '名詞')
#             proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
#             verb_flag = (features[0] == '動詞') & (features[1] == '自立')
#             adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
#             if proper_noun_flag:
#                 tokens.append(surface)
#             elif noun_flag:
#                 tokens.append(surface)
#             elif verb_flag:
#                 tokens.append(surface)
#             elif adjective_flag:
#                 tokens.append(surface)
#             node = node.next
#         return " ".join(tokens)
#
#     query_series = query_series.map(tokenizer_func)
#     # query_series = tokenizer_func(query_series)
#
#     #---------------Normalization-----------#
#     query_series = query_series.map(lambda x: x.lower()) # 小文字に統一
#     # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
#     return query_series

def question_vector(series, query_series=None):
    """Vectorize the question documents and the query with one TF-IDF model.

    The vectorizer is fitted on ``series`` and then applied to
    ``query_series``, so both outputs share the same vocabulary.

    Args:
        series: Preprocessed question documents (iterable of strings).
        query_series: Preprocessed query text(s). The original implementation
            read a module-level ``query_series`` that this file never defines;
            when this argument is omitted, that legacy global is still used.

    Returns:
        tuple: ``(question_matrix, query_matrix)`` as dense arrays.

    Raises:
        NameError: If ``query_series`` is neither passed nor defined globally.
    """
    if query_series is None:
        # Backward compatibility with callers that set a module-level global.
        try:
            query_series = globals()['query_series']
        except KeyError:
            raise NameError("name 'query_series' is not defined") from None

    tfidf = TfidfVectorizer()
    question_matrix = tfidf.fit_transform(series).toarray()
    query_matrix = tfidf.transform(query_series).toarray()
    return question_matrix, query_matrix

def get_cs(query_series, series):
    """Cosine similarity between every question document and the query.

    A TF-IDF vectorizer is fitted on ``series``; the same vectorizer then
    transforms ``query_series`` so both sides share one vocabulary.

    Args:
        query_series: Preprocessed query text(s).
        series: Preprocessed question documents.

    Returns:
        The result of ``cosine_similarity(documents, queries)`` — presumably
        one row per document and one column per query (confirm with callers).
    """
    vectorizer = TfidfVectorizer()
    doc_matrix = vectorizer.fit_transform(series).toarray()
    query_matrix = vectorizer.transform(query_series).toarray()
    return cosine_similarity(doc_matrix, query_matrix)

def get_len_series(series):
    """Return the number of rows in the TF-IDF matrix built from ``series``.

    NOTE(review): the original comment described this as the total amount of
    words, but the value returned is the row count of the matrix (one row per
    input document) — confirm which one callers expect.

    Args:
        series: Preprocessed question documents.

    Returns:
        int: Number of rows of the fitted TF-IDF matrix.
    """
    matrix = TfidfVectorizer().fit_transform(series).toarray()
    return matrix.shape[0]

def find_top_n(n, cs):
    """Return the indices of the ``n`` largest similarities, best first.

    ``cs`` is flattened before sorting (``axis=None``), so the indices refer
    to the flattened array.

    Args:
        n: How many indices to return. Values of zero or less give an empty
            result; values larger than the number of entries return them all.
        cs: Array of cosine similarities.

    Returns:
        numpy.ndarray: Up to ``n`` integer indices in descending order of
        similarity.
    """
    if n <= 0:
        # A slice of [-0:] would select the whole array, not nothing.
        return np.array([], dtype=np.intp)
    ascending = np.argsort(cs, axis=None)
    return ascending[-n:][::-1]  # take the tail, then flip to descending order

def get_n_cs(cs, top_n_index, top_n_indices):
    """Return the cosine similarity stored for document ``top_n_index``.

    Args:
        cs: Two-level indexable of similarities; the value is read from
            ``cs[top_n_index][0]``.
        top_n_index: Row index of the document.
        top_n_indices: Unused; kept so existing callers keep working. The
            original looped over it while recomputing the same value, which
            raised UnboundLocalError when it was empty.

    Returns:
        The similarity value at ``cs[top_n_index][0]``.
    """
    return cs[top_n_index][0]

####
# 今困ってるのはコサイン類似度0より大きいものの文書に出てくる単語を抽出し表示，その単語を選択させるプログラムが出来ない
# 1. 単語の抽出
# 2. 単語の表示
# 3. 単語の選択（単語自体の入力 もしくは 番号で選択させる）
# 4. 文書を1つに絞るまでやる

# 同じ内容の文書（似たような文書）がある．その辺の扱いは一旦保留


# def get_cs_words(query_series, series, texts):
def get_new_words(cs, texts):
    """Extract nouns and proper nouns from the documents that matched.

    A document counts as matched when it appears in the first index array of
    ``cs.nonzero()``, i.e. when its cosine similarity is non-zero.

    Args:
        cs: Cosine-similarity array (must provide ``nonzero()``).
        texts: Raw question texts, indexable by document number.

    Returns:
        pandas Series with the preprocessed (noun / proper noun only) text of
        every matched document.
    """
    matched_rows = cs.nonzero()[0]  # only the first index array is needed
    matched_texts = [texts[row] for row in matched_rows]
    # When this module is used interactively as ``all``, call
    # ``all.preprocess`` here instead.
    new_series = preprocess(pd.Series(matched_texts), ['名詞', '固有名詞'])
    print(type(new_series))
    return new_series

    # new_q_series = pd.Series(texts[top_n_indices])

    # tfidf = TfidfVectorizer() # Tf-Idf 化関数に名前を付ける
    # question_vector = tfidf.fit_transform(series).toarray() # 質問文書を Tf-Idf を用いて数値化
    # query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
    # # print(query_vector)
    # # cs_nonzero = query_vector.nonzero()
    # # nonzero_indices = np.argwhere(cs != 0)
    # # for nonzero_index in nonzero_indices:
    # #      # print(texts[nonzero_index])
    # #      print(nonzero_index)
    # # return series[nonzero_index]
    # nonzero_indices = tfidf.inverse_transform(query_vector)
    # print(nonzero_indices)
    # n_cs
    # return nonzero_indices

# やること
# 3. 単語の選択（単語自体の入力 もしくは 番号で選択させる）
# 4. 文書を1つに絞るまでやる

def select_topic(new_series):
    """Count the words in the candidate documents and show them to the user.

    Each value of ``new_series`` is split on single spaces and all resulting
    words are counted. Empty strings (produced by documents with no remaining
    words, or by repeated spaces) are not counted as words.

    Prints the number of distinct words and the set of words.

    Args:
        new_series: pandas Series of space-separated word strings.

    Returns:
        collections.Counter: Word -> number of occurrences.
    """
    token_lists = new_series.str.split(' ')
    # chain.from_iterable flattens in linear time; sum(lists, []) is quadratic.
    words = [w for w in itertools.chain.from_iterable(token_lists) if w]

    new_series_count = collections.Counter(words)

    print(f"{len(new_series_count)}件のワードが見つかりました")
    print(f"{set(new_series_count)}")
    return new_series_count

def new_input():
    """Ask the user to pick from the suggested words and preprocess the reply.

    The raw answer is wrapped in a one-element pandas Series and run through
    ``preprocess`` with nouns, proper nouns, verbs and adjectives enabled.

    Returns:
        pandas Series holding the preprocessed answer.
    """
    answer = input('上の中から近いワードを選んでください．複数選択する場合は半角スペースで区切って入力してください．: ')
    return preprocess(pd.Series(answer), ['名詞', '固有名詞', '動詞', '形容詞'])

# def new_question(new_query_pd, str_query):
def new_str_query(new_input, str_query):
    """Append the newly chosen words to the current query.

    Only the first element of ``str_query`` is used; converting the whole
    Series to ``str`` would also pull in its ``dtype: object`` footer.

    Args:
        new_input: Words to add (joined to the query with one space).
        str_query: Current query; its element ``[0]`` is the query string.

    Returns:
        pandas Series containing the extended query.
    """
    base_query = str_query[0]
    return pd.Series(base_query + ' ' + new_input)

def new_question_answer(new_str_query, processed_new_texts_series, n):
    """Rank the previously selected documents against the refined query.

    Args:
        new_str_query: Refined query text(s).
        processed_new_texts_series: Preprocessed texts of the documents that
            were selected in the earlier round.
        n: Number of top-ranked document indices to return.

    Returns:
        The indices produced by ``find_top_n`` for the new cosine similarities.
    """
    return find_top_n(n, get_cs(new_str_query, processed_new_texts_series))

def cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value):
    """Print the ranked documents whose similarity exceeds ``cs_value``.

    Candidates are visited in the given order and listing stops at the first
    one whose similarity is not above the threshold. A summary line is always
    printed afterwards — either the number of documents listed or a hint to
    change the search words when none qualified. (Previously the summary was
    skipped when every candidate qualified or the candidate list was empty.)

    NOTE(review): the messages say "以上" (>=) but the comparison is strict
    (>); kept as in the original — confirm which one is intended.

    Args:
        new_top_n_indices: Document indices, expected best first.
        new_cs: Similarities; the value for document ``i`` is ``new_cs[i][0]``.
        new_texts: Question texts, indexable by document index.
        new_cs_max: Unused; kept for interface compatibility.
        cs_value: Similarity threshold.
    """
    counter = 0
    print(f"コサイン類似度 {cs_value} 以上の文書は以下の通りです。")
    for new_top_n_index in new_top_n_indices:
        new_n_cs = new_cs[new_top_n_index][0]
        if new_n_cs > cs_value:
            print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
            counter = counter + 1
        elif new_n_cs <= cs_value:
            # Candidates are assumed to be sorted, so nothing further can match.
            break

    # Report the outcome no matter how the loop ended.
    if counter == 0:
        print(f"コサイン類似度 {cs_value} 以上の質問文書はありません。検索ワードを変えてやり直してください。")
    else:
        print(f"コサイン類似度 {cs_value} 以上の質問文書は {counter} 件です。\n")


# def cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value):
#     counter = 0
#     print(f"コサイン類似度 {cs_value} 以上の文書は以下の通りです。")
#     for new_top_n_index in top_n_indices: # 結果の表示
#         new_n_cs = get_n_cs(new_cs, new_top_n_index, top_n_indices) # n 個のコサイン類似度の取得．<class 'numpy.float64'>
#         if new_cs_max <= cs_value:
#             print(f"コサイン類似度 {cs_value} の質問文書はありません。検索ワードを変えてやり直してください。")
#             break
#         if new_n_cs > cs_value:
#             print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
#         #     # print(f"#{new_top_n_index} コサイン類似度: {new_n_cs}")
#             counter = counter + 1
#             continue
#         if new_n_cs <= cs_value:
#             print(f"コサイン類似度 {cs_value} 以上の質問文書は {counter} 件です。\n")
#             break

def print_new_words(new_top_n_indices, new_cs, new_texts, new_cs_max):
    """Print the matching documents for each similarity threshold in turn.

    ``cs_selector`` is called once per threshold, in ascending order, with
    otherwise identical arguments.

    Args:
        new_top_n_indices: Ranked document indices.
        new_cs: Cosine similarities of the documents.
        new_texts: Question texts.
        new_cs_max: Passed through to ``cs_selector`` unchanged.
    """
    # Thresholds 0.5 through 0.8 are currently disabled.
    for cs_value in (0.2, 0.3, 0.4):
        cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
#
# def print_new_words(top_n_indices, new_cs, new_texts, new_cs_max):
#
#     # print(new_n_cs)
#     cs_value = 0.2
#     cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
#     cs_value = 0.3
#     cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
#     cs_value = 0.4
#     cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value)

def listing_query(query):
    """Wrap a single query in a new one-element list.

    Args:
        query: The query object to wrap.

    Returns:
        list: ``[query]``.
    """
    return [query]

## dic.csv

          
            ado
            *
            *
            10
            名詞
            固有名詞
            一般
            *
            *
            *
            ado
            ado
            ado

            
              All
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              all apps
              all
              all

            
              apex
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Apex One
              apex
              apex

            
              Apex
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Apex One
              apex
              apex

            
              APEX
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Apex One
              apex
              apex

            
              apexone
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Apex One
              apex
              apex

            
              APEXONE
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Apex One
              apex
              apex

            
              ApexOne
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Apex One
              apex
              apex

            
              Apps
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Apex One
              apex
              apex

            
              eduroam
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              eduroam
              eduroam
              eduroam

            
              Eduroam
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              eduroam
              eduroam
              eduroam

            
              EDUROAM
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              eduroam
              eduroam
              eduroam

            
              Forms
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Forms
              forms
              forms

            
              list
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              list
              list
              list

            
              logout
              *
              *
              10
              名詞
              一般
              一般
              *
              *
              *
              ログアウト
              ログアウト
              ログアウト

            
              mathmatica
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              mathmatica
              mathmatica
              mathmatica

            
              Mathmatica
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              mathmatica
              mathmatica
              mathmatica

            
              MATHMATICA
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              mathmatica
              mathmatica
              mathmatica

            
              matlab
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              matlab
              mathlab
              mathlab

            
              MATHLAB
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              matlab
              mathlab
              mathlab

            
              Matlab
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              matlab
              mathlab
              mathlab

            
              office
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Office
              オフィス
              オフィス

            
              Office
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Office
              オフィス
              オフィス

            
              ocu
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ocu
              ocu
              ocu

            
              Ocu
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ocu
              ocu
              ocu

            
              OCU
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ocu
              ocu
              ocu

            
              ocuid
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ocuid
              ocuid
              ocuid

            
              OCUID
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ocuid
              ocuid
              ocuid

            
              OCUNET
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ocunet
              ocunet
              ocunet

            
              ocunet
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ocunet
              ocunet
              ocunet

            
              Pro
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              pro
              pro
              pro

            
              pro
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              pro
              pro
              pro

            
              PRO
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              pro
              pro
              pro

            
              Publisher
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              publisher
              publisher
              publisher

            
              Teams
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              teams
              teams
              teams

            
              teams
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              teams
              teams
              teams

            
              TEAMS
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              teams
              teams
              teams

            
              TrendMicro
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              トレンドマイクロ
              トレンドマイクロ
              トレンドマイクロ

            
              trendmicro
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              トレンドマイクロ
              トレンドマイクロ
              トレンドマイクロ

            
              Trendmicro
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              トレンドマイクロ
              トレンドマイクロ
              トレンドマイクロ

            
              TRENDMICRO
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              トレンドマイクロ
              トレンドマイクロ
              トレンドマイクロ

            
              unipa
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ユニバーサルパスポート
              ユニパ
              ユニパ

            
              Unipa
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ユニバーサルパスポート
              ユニパ
              ユニパ

            
              UNIPA
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ユニバーサルパスポート
              ユニパ
              ユニパ

            
              ユニパ
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ユニバーサルパスポート
              ユニパ
              ユニパ

            
              ユニバーサルパスポート
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              ユニバーサルパスポート
              ユニバーサルパスポート
              ユニバーサルパスポート

            
              update
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              アップデート
              アップデート
              アップデート

            
              Update
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              アップデート
              アップデート
              アップデート

            
              UPDATE
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              アップデート
              アップデート
              アップデート

            
              WEB
              *
              *
              10
              名詞
              一般
              一般
              *
              *
              *
              web
              ウェブ
              ウェブ

            
              Web
              *
              *
              10
              名詞
              一般
              一般
              *
              *
              *
              web
              ウェブ
              ウェブ

            
              WebAuth
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              webauth
              webauth
              webauth

            
              webauth
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              webauth
              webauth
              webauth

            
              WebEx
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              webex
              webex
              webex

            
              webex
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              webex
              webex
              webex

            
              Wifi
              *
              *
              10
              名詞
              一般
              一般
              *
              *
              *
              Wi-Fi
              ワイファイ
              ワイファイ

            
              WINDOWS
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Windows
              ウィンドウズ
              ウィンドウズ

            
              WIndows
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Windows
              ウィンドウズ
              ウィンドウズ

            
              windows
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Windows
              ウィンドウズ
              ウィンドウズ

            
              ZOOM
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              Zoom
              ズーム
              ズーム

            
              聞蔵
              *
              *
              10
              名詞
              固有名詞
              一般
              *
              *
              *
              聞蔵
              きくぞう
              きくぞう

## test-first-query.csv

          
            Questions
            First query
            
            cs >= 0.2
            satisfy(0.2)
            cs >= 0.3
            satisfy(0.3)
            cs >= 0.4
            satisfy(0.4)
            MAX CS
            satisfy(0.2) = 1
            satisfy(0.3) = 1
            satisfy(0.4) = 1

            
              （非常勤講師）メールアドレスが欲しい
              メールアドレス
              
              12
              1
              9
              1
              8
              1
              0.61
              0.61
              0.61
              0.61

            
              eduroamの認証方法が分からない
              eduroam
              
              10
              1
              10
              1
              9
              1
              0.77
              0.77
              0.77
              0.77

            
              eduroamの認証方法が分からない
              eduroam
              1

            
              MACアドレスを調べる
              MACアドレス
              
              1
              0
              1
              0
              1
              0
              0.53

            
              Officeのインストール方法が分からない
              インストール出来ない
              
              29
              1
              15
              1
              4
              0
              0.66
              0.66
              0.66

            
              Officeのインストール方法が分からない
              officeのインストール
              
              22
              1
              15
              1
              7
              1
              0.74
              0.74
              0.74
              0.74

            
              Teamsの設定方法がわからない
              Teams
              
              7
              1
              6
              1
              6
              1
              0.66
              0.66
              0.66
              0.66

            
              ThunderbirdにOCUメールを設定したい
              Thunderbird
              
              7
              1
              6
              1
              5
              1
              0.54
              0.54
              0.54
              0.54

            
              ThunderbirdにOCUメールを設定したい
              Thunderbird
              1

            
              Unipaのマニュアルが欲しい
              Unipa
              
              1
              1
              1
              1
              1
              1
              0.4
              0.4
              0.4
              0.4

            
              VPNに接続できない
              vpn
              
              7
              1
              7
              1
              4
              1
              0.62
              0.62
              0.62
              0.62

            
              VPNに接続できない
              仮想ネットワーク
              
              22
              1
              15
              1
              12
              1
              0.79
              0.79
              0.79
              0.79

            
              VPNの接続方法がわからない
              仮想ネットワーク
              1

            
              VPNの接続方法がわからない
              vpn
              1

            
              VPNの接続方法がわからない
              vpn
              1

            
              Wi-Fiに接続できない
              wi-fi
              
              10
              1
              9
              1
              9
              1
              0.79
              0.79
              0.79
              0.79

            
              Wi-Fiに接続できない
              wi-fi
              1

            
              Wi-Fiのパスワードがわからない
              wi-fi
              1

            
              Windows10Proが欲しい
              Windows10Pro
              
              4
              1
              3
              1
              2
              1
              0.71
              0.71
              0.71
              0.71

            
              Cisco WebEXの有料アカウントが欲しい
              WebEx
              
              1
              1
              1
              1
              1
              1
              0.57
              0.57
              0.57
              0.57

            
              Zoomの有料アカウントが欲しい
              zoom
              
              15
              1
              12
              1
              8
              1
              0.67
              0.67
              0.67
              0.67

            
              Zoomの有料アカウントが欲しい
              Zoom
              1

            
              Zoomの有料アカウントが欲しい
              zoom
              1

            
              Zoomのログインする方法を知りたい
              zoom
              1
              15
              1
              12
              1
              8
              1
              0.67
              0.67
              0.67
              0.67

            
              ウイルス対策ソフト ApexOneのインストール方法が分からない
              ウイルス対策ソフト
              
              12
              1
              7
              1
              6
              1
              0.81
              0.81
              0.81
              0.81

            
              ウイルス対策ソフト ApexOneのインストール方法が分からない
              Apexone
              
              11
              1
              11
              1
              7
              1
              0.81
              0.81
              0.81
              0.81

            
              ウイルス対策ソフト ApexOneのインストール方法が分からない
              TrendMicro
              
              8
              1
              8
              1
              1
              0
              0.46
              0.46
              0.46

            
              共有PCにOfficeをインストールしたい
              共有PC
              
              10
              1
              5
              1
              3
              1
              0.75
              0.75
              0.75
              0.75

            
              固定IPアドレスについて
              IPアドレス
              
              1
              1
              1
              1
              1
              1
              0.47
              0.47
              0.47
              0.47

            
              ネットワークに接続できない
              ネットワーク
              
              29
              1
              20
              1
              8
              1
              0.52
              0.52
              0.52
              0.52

            
              ネットワークに接続できない
              ネットワーク
              1

            
              ネットワークに接続できない
              ネットワーク
              1

            
              プリンターで印刷できない
              印刷できない
              
              15
              1
              7
              1
              4
              1
              0.82
              0.82
              0.82
              0.82

            
              プリンターで印刷できない
              繋がらない
              
              3
              0
              3
              0
              3
              0
              0.85

            
              プリンターで印刷できない
              接続出来ない
              
              33
              1
              17
              1
              6
              1
              0.77
              0.77
              0.77
              0.77

            
              プリンターで印刷できない
              プリンター
              
              4
              1
              4
              1
              4
              1
              0.67
              0.67
              0.67
              0.67

            
              プリンターで印刷できない
              プリンター
              1
              4
              1
              4
              1
              4
              1
              0.67
              0.67
              0.67
              0.67

            
              プリンターで印刷できない
              プリンター
              1
              4
              1
              4
              1
              4
              1
              0.67
              0.67
              0.67
              0.67

            
              名誉教授がVPNを使用したい
              名誉教授
              
              2
              1
              2
              1
              2
              1
              0.71
              0.71
              0.71
              0.71

            
              名誉教授向けのOCUIDについて
              名誉教授
              
              2
              1
              2
              1
              2
              1
              0.71
              0.71
              0.71
              0.71

            
              迷惑メールに分類されてしまう
              迷惑メール
              
              2
              1
              2
              1
              1
              1
              0.58
              0.58
              0.58
              0.58

            
              メーリングリストを差出人に設定したい
              メーリングリスト
              
              8
              1
              8
              1
              5
              1
              0.55
              0.55
              0.55
              0.55

            
              メーリングリストを差出人に設定したい
              メーリングリスト
              1

            
              メーリングリストを差出人に設定したい
              メーリングリスト
              1

            
              メールアドレス (@st.osaka-cu.ac.jp) が使用できない
              メールアドレス
              
              12
              1
              9
              0
              8
              0
              0.61
              0.61

            
              リモートデスクトップに接続できない
              リモートデスクトップ
              
              9
              1
              9
              1
              7
              1
              0.71
              0.71
              0.71
              0.71

            
              リモートデスクトップに接続できない
              リモートデスクトップ
              1

            
              全学認証パスワードが分からない
              全学認証パスワード
              
              25
              1
              9
              1
              6
              1
              0.85
              0.85
              0.85
              0.85

            
              全学認証仮パスワードがわからない
              全学認証
              
              19
              1
              13
              1
              8
              1
              0.72
              0.72
              0.72
              0.72

            
              全学認証パスワードの初期パスワードが分からない
              全学認証
              1

            
              18
              10.74285714
              0.942857143
              7.628571429
              0.914285714
              5
              0.857142857
              0.669714286
              0.668484848
              0.6703125
              0.677666667

## test-query.csv

          
            Questions
            First query
            Second query
            cs >= 0.2
            satisfy(0.2)
            cs >= 0.3
            satisfy(0.3)
            cs >= 0.4
            satisfy(0.4)
            最大CS
            satisfy(0.2) = 1
            satisfy(0.3) = 1
            satisfy(0.4) = 1

            
              （非常勤講師）メールアドレスが欲しい
              メールアドレス
              付与
              1
              1
              1
              1
              1
              1
              0.68
              0.68
              0.68
              0.68

            
              eduroamの認証方法が分からない
              eduroam
              接続方法
              4
              1
              4
              1
              4
              1
              1
              1
              1
              1

            
              eduroamの認証方法が分からない
              eduroam
              設定
              2
              1
              2
              1
              2
              1
              0.78
              0.78
              0.78
              0.78

            
              MACアドレスを調べる
              MACアドレス
              調べる
              1
              0
              1
              0
              1
              0
              0.35

            
              Officeのインストール方法が分からない
              インストール出来ない
              office
              7
              1
              5
              1
              3
              1
              0.64
              0.64
              0.64
              0.64

            
              Officeのインストール方法が分からない
              officeのインストール
              方法
              10
              0
              5
              0
              2
              0
              0.63

            
              Teamsの設定方法がわからない
              Teams
              設定
              6
              1
              3
              1
              0
              1
              0.39
              0.39
              0.39
              0.39

            
              ThunderbirdにOCUメールを設定したい
              Thunderbird
              設定方法
              5
              1
              5
              1
              4
              1
              0.53
              0.53
              0.53
              0.53

            
              ThunderbirdにOCUメールを設定したい
              Thunderbird
              OCUメール
              7
              1
              5
              1
              5
              1
              0.77
              0.77
              0.77
              0.77

            
              Unipaのマニュアルが欲しい
              Unipa
              マニュアル
              1
              1
              1
              1
              1
              1
              0.53
              0.53
              0.53
              0.53

            
              VPNに接続できない
              vpn
              繋がらない
              5
              1
              1
              1
              0
              0
              0.32
              0.32
              0.32

            
              VPNに接続できない
              仮想ネットワーク
              接続
              5
              0
              1
              0
              1
              0
              0.45

            
              VPNの接続方法がわからない
              仮想ネットワーク
              登録方法
              10
              1
              5
              1
              4
              1
              0.91
              0.91
              0.91
              0.91

            
              VPNの接続方法がわからない
              vpn
              接続方法
              4
              1
              4
              1
              2
              1
              0.62
              0.62
              0.62
              0.62

            
              VPNの接続方法がわからない
              vpn
              登録方法
              5
              1
              1
              1
              0
              0
              0.33
              0.33
              0.33
              0.33

            
              Wi-Fiに接続できない
              wi-fi
              接続
              3
              0
              3
              0
              2
              0
              0.52

            
              Wi-Fiに接続できない
              wi-fi
              繋がらない
              0
              0
              0
              0
              0
              0
              0.6

            
              Wi-Fiのパスワードがわからない
              wi-fi
              パスワード
              4
              1
              4
              1
              3
              1
              0.57
              0.57
              0.57
              0.57

            
              Windows10Proが欲しい
              Windows10Pro
              欲しい
              2
              1
              2
              1
              2
              1
              0.51
              0.51
              0.51
              0.51

            
              Cisco WebEXの有料アカウントが欲しい
              WebEx
              有料
              1
              1
              1
              1
              1
              1
              0.63
              0.63
              0.63
              0.63

            
              Zoomの有料アカウントが欲しい
              zoom
              有料アカウント
              7
              1
              3
              1
              2
              1
              0.49
              0.49
              0.49
              0.49

            
              Zoomの有料アカウントが欲しい
              Zoom
              有償アカウント
              7
              1
              4
              1
              2
              1
              0.53
              0.53
              0.53
              0.53

            
              Zoomの有料アカウントが欲しい
              zoom
              学生
              1
              1
              1
              1
              1
              1
              0.67
              0.67
              0.67
              0.67

            
              Zoomのログインする方法を知りたい
              zoom
              ログイン
              5
              1
              5
              1
              4
              1
              0.72
              0.72
              0.72
              0.72

            
              ウイルス対策ソフト ApexOneのインストール方法が分からない
              ウイルス対策ソフト
              インストール
              6
              1
              5
              1
              3
              1
              0.79
              0.79
              0.79
              0.79

            
              ウイルス対策ソフト ApexOneのインストール方法が分からない
              Apexone
              インストール
              7
              1
              5
              1
              2
              1
              0.67
              0.67
              0.67
              0.67

            
              ウイルス対策ソフト ApexOneのインストール方法が分からない
              TrendMicro
              ダウンロード出来ない
              3
              1
              3
              1
              2
              1
              0.6
              0.6
              0.6
              0.6

            
              共有PCにOfficeをインストールしたい
              共有PC
              office
              4
              1
              3
              1
              3
              1
              0.67
              0.67
              0.67
              0.67

            
              固定IPアドレスについて
              IPアドレス
              固定
              1
              1
              1
              1
              1
              1
              0.53
              0.53
              0.53
              0.53

            
              ネットワークに接続できない
              ネットワーク
              使用出来ない
              2
              1
              2
              1
              1
              1
              0.5
              0.5
              0.5
              0.5

            
              ネットワークに接続できない
              ネットワーク
              使えない
              1
              0
              1
              0
              1
              0
              0.43

            
              ネットワークに接続できない
              ネットワーク
              切れる
              9
              1
              0
              0
              0
              0
              0.28
              0.28

            
              プリンターで印刷できない
              印刷できない
              プリンター
              4
              1
              2
              1
              2
              1
              0.81
              0.81
              0.81
              0.81

            
              プリンターで印刷できない
              繋がらない
              プリンター
              3
              0
              1
              0
              1
              0
              0.51

            
              プリンターで印刷できない
              接続出来ない
              プリンター
              15
              1
              4
              0
              1
              0
              0.63
              0.63

            
              プリンターで印刷できない
              プリンター
              印刷できない
              1
              1
              1
              1
              1
              1
              0.87
              0.87
              0.87
              0.87

            
              プリンターで印刷できない
              プリンター
              繋がらない
              6
              1
              2
              1
              1
              1
              0.4
              0.4
              0.4
              0.4

            
              プリンターで印刷できない
              プリンター
              接続出来ない
              1
              1
              1
              1
              1
              1
              0.4
              0.4
              0.4
              0.4

            
              名誉教授がVPNを使用したい
              名誉教授
              vpn
              1
              1
              1
              1
              1
              1
              0.66
              0.66
              0.66
              0.66

            
              名誉教授向けのOCUIDについて
              名誉教授
              OCUID
              2
              1
              1
              1
              1
              1
              0.66
              0.66
              0.66
              0.66

            
              迷惑メールに分類されてしまう
              迷惑メール
              OCUメール
              1
              1
              1
              1
              1
              1
              0.66
              0.66
              0.66
              0.66

            
              メーリングリストを差出人に設定したい
              メーリングリスト
              設定
              3
              1
              3
              1
              3
              1
              0.63
              0.63
              0.63
              0.63

            
              メーリングリストを差出人に設定したい
              メーリングリスト
              Thunderbird
              1
              1
              1
              1
              1
              1
              0.6
              0.6
              0.6
              0.6

            
              メーリングリストを差出人に設定したい
              メーリングリスト
              差出人
              4
              1
              4
              1
              4
              1
              0.53
              0.53
              0.53
              0.53

            
              メールアドレス (@st.osaka-cu.ac.jp) が使用できない
              メールアドレス
              使用出来ない
              2
              1
              2
              1
              1
              1
              0.52
              0.52
              0.52
              0.52

            
              リモートデスクトップに接続できない
              リモートデスクトップ
              繋がらない
              1
              1
              1
              1
              1
              1
              0.54
              0.54
              0.54
              0.54

            
              リモートデスクトップに接続できない
              リモートデスクトップ
              接続出来ない
              4
              1
              3
              1
              3
              1
              0.58
              0.58
              0.58
              0.58

            
              全学認証パスワードが分からない
              全学認証パスワード
              分からない
              8
              1
              7
              1
              6
              1
              0.63
              0.63
              0.63
              0.63

            
              全学認証パスワードの初期パスワードが分からない
              全学認証
              仮パスワード
              6
              1
              5
              1
              5
              1
              0.67
              0.67
              0.67
              0.67

            
              全学認証パスワードの初期パスワードが分からない
              全学認証
              初期パスワード
              5
              1
              5
              1
              4
              1
              0.89
              0.89
              0.89
              0.89

            
              Average
              4.08
              0.86
              2.64
              0.82
              1.96
              0.78
              0.5966
              0.61255814
              0.620243902
              0.62775

## test10.py
import importlib
# importlib.reload()
import argparse
import numpy as np
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display

import pandas as pd

import MeCab
tagger = MeCab.Tagger("-Ochasen")
import mojimoji
import os
import urllib

from collections import Counter
import collections

# Paths of every dataset text file matching data/ocu2/*.txt, collected once
# when this script is loaded and read by main() below.
text_paths = glob.glob('data/ocu2/*.txt')
# NOTE: this binds the name `all` to the project helper imported from `func`,
# which shadows Python's builtin all() for the rest of this file.
from func import all
# from func import q_get
# from func import a_get
# from func import load_stopwords
# from func import preprocess
# from func import get_cs
# from func import find_top_n
# from func import get_n_cs
# from func import listing_query

def main(args):
    """Search the question dataset for entries similar to ``args.query``.

    The question texts and the query are each wrapped in a pandas Series,
    passed through ``all.preprocess`` with the same part-of-speech flags,
    and compared with ``all.get_cs``.  When the largest similarity exceeds
    1e-10 the ranking is handed to ``all.print_new_words`` and the top hit
    is printed; otherwise "NotFound" is printed.

    Args:
        args: Parsed command-line arguments; only ``args.query`` is read.
    """
    question_docs = all.q_get(text_paths)
    # Answer texts are loaded but not used further in this script.
    answer_docs = all.a_get(text_paths)
    query_docs = all.listing_query(args.query)

    question_frame = pd.Series(question_docs)
    query_frame = pd.Series(query_docs)

    # A fresh list is passed to each call, as the original did with literals.
    pos_flags = ('名詞', '固有名詞', '動詞', '形容詞')
    corpus_tokens = all.preprocess(question_frame, list(pos_flags))
    query_tokens = all.preprocess(query_frame, list(pos_flags))

    similarities = all.get_cs(query_tokens, corpus_tokens)
    doc_count = all.get_len_series(corpus_tokens)  # number of dataset documents
    ranking = all.find_top_n(doc_count, similarities)

    # The result is not used below; the call is kept so behaviour is unchanged.
    candidate_words = all.get_new_words(similarities, question_docs)

    # np.argmax yields a flat index; presumably `similarities` has one column
    # per row so that index is also the row index -- TODO confirm in get_cs.
    best_cs = similarities[np.argmax(similarities)][0]

    # Written as `not ... >` so a NaN similarity still falls into "NotFound",
    # exactly like the original if/else.
    if not best_cs > 1e-10:
        print("NotFound")
        return

    all.print_new_words(ranking, similarities, question_docs, best_cs)
    print(f"コサイン類似度最大番号: {ranking[0]}, コサイン類似度: {best_cs}")

if __name__ == "__main__":
    # Script entry point: takes one positional argument, the query text,
    # and forwards the parsed namespace to main().
    cli = argparse.ArgumentParser()
    cli.add_argument("query", type=str)
    main(cli.parse_args())

## test9.py
import importlib
# importlib.reload()
import argparse
import numpy as np
import glob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from IPython.display import display

import pandas as pd

import MeCab
tagger = MeCab.Tagger("-Ochasen")
import mojimoji
import os
import urllib

from collections import Counter
import collections

# Paths of every dataset text file matching data/ocu2/*.txt, collected once
# when this script is loaded and read by main() below.
text_paths = glob.glob('data/ocu2/*.txt')
# NOTE: this binds the name `all` to the project helper imported from `func`,
# which shadows Python's builtin all() for the rest of this file.
from func import all
# from func import q_get
# from func import a_get
# from func import load_stopwords
# from func import preprocess
# from func import get_cs
# from func import find_top_n
# from func import get_n_cs
# from func import listing_query

def main(args):
    """Run the two-stage interactive question search for ``args.query``.

    Stage one ranks every question document against the query by TF-IDF
    cosine similarity and prints each document whose similarity is above
    1e-10.  Stage two shows the words found in the matching documents, asks
    the user for additional words, appends them to the query and re-ranks
    only the documents printed in stage one.  Prints "NotFound" when no
    document has a similarity above 1e-10.

    Args:
        args: Parsed command-line namespace; only ``args.query`` is read.
    """
    pos_tags = ['名詞', '固有名詞', '動詞', '形容詞']

    texts = all.q_get(text_paths)
    all.a_get(text_paths)  # answers are read but not used by this script
    query_texts = all.listing_query(args.query)

    str_data = all.preprocess(pd.Series(texts), pos_tags)
    str_query = all.preprocess(pd.Series(query_texts), pos_tags)

    cs = all.get_cs(str_query, str_data)
    n = all.get_len_series(str_data)  # number of documents in the data set
    top_n_indices = all.find_top_n(n, cs)  # document indices, best match first

    # Words of every document with a non-zero similarity (offered in stage two).
    new_series = all.get_new_words(cs, texts)

    best_row = np.argmax(cs)
    max_cs = cs[best_row][0]
    if not max_cs > 1e-10:
        print("NotFound")
        return

    # Stage one: list the matches and remember them for the second pass.
    new_texts = []
    for doc_index in top_n_indices:
        score = all.get_n_cs(cs, doc_index, top_n_indices)
        if score > 1e-10:
            print(f"質問データ #{doc_index}, コサイン類似度: {score}, '{texts[doc_index]}'")
            new_texts.append(texts[doc_index])
        elif score <= 1e-10:
            # Indices are sorted by similarity, so nothing further can match.
            break

    all.select_topic(new_series)  # prints the candidate words

    # Stage two: re-rank the listed documents against the extended query.
    processed_new_texts_series = all.preprocess(pd.Series(new_texts), pos_tags)
    extra_words = all.new_input()
    extended_query = all.new_str_query(extra_words, str_query)
    new_top_n_indices = all.new_question_answer(
        extended_query, processed_new_texts_series, n
    )
    new_cs = all.get_cs(extended_query, processed_new_texts_series)
    new_cs_max = np.argmax(new_cs)  # row of the best second-stage match
    all.print_new_words(new_top_n_indices, new_cs, new_texts, new_cs_max)
    new_max_cs = new_cs[new_cs_max][0]
    print(f"コサイン類似度最大番号: {new_cs_max}, コサイン類似度: {new_max_cs}")

if __name__ == "__main__":
    # Command-line entry point: the single positional argument is the
    # free-text search query that main() reads as ``args.query``.
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("query", type=str)
    main(cli_parser.parse_args())
	import pandas as pd
	import MeCab
	tagger = MeCab.Tagger("-Ochasen")
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	from collections import Counter
	import collections
	import itertools

	def q_get(text_paths):
	    """Read the question text out of each record file.

	    Every file is read whole and split on commas; the field at index 8
	    is taken as the question text, its newlines are removed and
	    surrounding double quotes are stripped.

	    Args:
	        text_paths: Iterable of file paths to read.

	    Returns:
	        A list with one question string per path, in input order.  A
	        file with fewer than nine comma-separated fields yields ''.
	    """
	    texts = []
	    for text_path in text_paths:
	        # The context manager closes the handle promptly; the bare
	        # open(...).read() used before left it to the garbage collector.
	        with open(text_path, 'r') as record_file:
	            cells = record_file.read().split(',')
	        # NOTE: the split is a plain comma split, so quoted cells that
	        # contain commas are split as well.
	        question = ' '.join(cells[8:9])
	        question = question.replace('\n', '').strip('"')
	        texts.append(question)
	    return texts

	def a_get(text_paths):
	    """Read the answer text out of each record file.

	    Every file is read whole and split on commas; the field at index 16
	    is taken as the answer text and surrounding double quotes are
	    stripped.  Newlines inside the answer are kept.

	    Args:
	        text_paths: Iterable of file paths to read.

	    Returns:
	        A list with one answer string per path, in input order.  A file
	        with fewer than seventeen comma-separated fields yields ''.
	    """
	    a_texts = []
	    for text_path in text_paths:
	        # The context manager closes the handle promptly; the bare
	        # open(...).read() used before left it to the garbage collector.
	        with open(text_path, 'r') as record_file:
	            cells = record_file.read().split(',')
	        answer = ' '.join(cells[16:17]).strip('"')
	        a_texts.append(answer)
	    return a_texts

	def load_stopwords(path="data/jp_stop_words.txt"):
	    """Load the stop-word list from a local text file.

	    The file is parsed with ``pandas.read_csv`` without a header row and
	    the first column is returned.

	    Args:
	        path: Location of the stop-word file.

	    Returns:
	        A list holding the first column of the file, one entry per row.
	    """
	    # The file is not downloaded here.  The original code carried this
	    # source URL in an unused local variable:
	    # http://svn.sourceforge.jp/svnroot/slothlib/CSharp/Version1/SlothLib/NLP/Filter/StopWord/word/Japanese.txt
	    return pd.read_csv(path, header=None)[0].tolist()

	# def preprocess(series, flags = ['名詞', '固有名詞', '動詞', '形容詞']): # 前処理
	# stop_words = load_stopwords() # ストップワードの削除
	# def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
	# tokens = []
	# node = tagger.parseToNode(str(text))
	# while node:
	# features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
	# surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
	# if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
	# node = node.next
	# continue
	#
	# if (features[0] == '名詞') & ('名詞' in flags): # MeCab での名詞かつ名詞という flag の付いたものを抽出
	# tokens.append(surface)
	# elif ((features[0] == '名詞') & (features[1] == '固有名詞')) & ('固有名詞' in flags): # MeCab での固有名詞かつ固有名詞という flag の付いたものを抽出
	# tokens.append(surface)
	# elif ((features[0] == '動詞') & (features[1] == '自立')) & ('動詞' in flags): # MeCab での動詞（自立）かつ動詞という flag の付いたものを抽出
	# tokens.append(surface)
	# elif ((features[0] == '形容詞') & (features[1] == '自立')) & ('形容詞' in flags): # MeCab での形容詞かつ形容詞という flag の付いたものを抽出
	# tokens.append(surface)
	#
	# # noun_flag = (features[0] == '名詞')
	# # proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
	# # verb_flag = (features[0] == '動詞') & (features[1] == '自立')
	# # adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
	# # if proper_noun_flag:
	# # tokens.append(surface)
	# # elif noun_flag:
	# # tokens.append(surface)
	# # elif verb_flag:
	# # tokens.append(surface)
	# # elif adjective_flag:
	# # tokens.append(surface)
	#
	# node = node.next
	# return " ".join(tokens)
	#
	# series = series.map(tokenizer_func)
	#
	# #---------------Normalization-----------#
	# series = series.map(lambda x: x.lower()) # 小文字に統一
	# # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
	#
	# return series
	def preprocess(series, flags=('名詞', '固有名詞', '動詞', '形容詞')):
	    """Tokenize each document with MeCab and keep selected parts of speech.

	    Each element of *series* is parsed with the module-level ``tagger``.
	    A token is kept, in its base form, when the base form is known, is at
	    least two characters long, is not a stop word, and its part of
	    speech is enabled in *flags*.  The kept base forms are joined with
	    single spaces and lower-cased.

	    Args:
	        series: pandas Series of raw document strings.
	        flags: Parts of speech to keep; any container supporting ``in``.
	            Recognised values are '名詞', '固有名詞', '動詞' and '形容詞'.
	            Verbs and adjectives are kept only when their sub-category
	            is '自立'.

	    Returns:
	        A Series of the same length holding the space-joined tokens.
	    """
	    # A set gives constant-time stop-word checks; the list used before
	    # was scanned once per token.
	    stop_words = set(load_stopwords())

	    def tokenizer_func(text):
	        """Return the space-joined base forms of the tokens kept from *text*."""
	        tokens = []
	        node = tagger.parseToNode(str(text))
	        while node:
	            features = node.feature.split(',')  # feature string is comma separated
	            surface = features[6]  # field 6 holds the base (dictionary) form
	            pos, pos_detail = features[0], features[1]
	            # '*' means no base form is available for this token.
	            usable = (
	                surface != '*'
	                and len(surface) >= 2
	                and surface not in stop_words
	            )
	            if usable and (
	                (pos == '名詞' and '名詞' in flags)
	                or (pos == '名詞' and pos_detail == '固有名詞' and '固有名詞' in flags)
	                or (pos == '動詞' and pos_detail == '自立' and '動詞' in flags)
	                or (pos == '形容詞' and pos_detail == '自立' and '形容詞' in flags)
	            ):
	                tokens.append(surface)
	            node = node.next
	        return " ".join(tokens)

	    series = series.map(tokenizer_func)
	    # Normalisation: lower-case the tokens.
	    return series.map(lambda x: x.lower())
	# query_preprocess は不要．
	# def query_preprocess(query_series): # 前処理
	# stop_words = load_stopwords() # ストップワードの削除
	# def tokenizer_func(text): # MeCab で名詞，動詞，形容動詞のみを残す処理する部分
	# tokens = []
	# node = tagger.parseToNode(str(text))
	# while node:
	# features = node.feature.split(',') # MeCab 辞書はコンマ区切りなので，コンマで分割
	# surface = features[6] # MeCab 辞書の6番目の言葉の原型を抽出
	# if (surface == '*') or (len(surface) < 2) or (surface in stop_words): # 知らない言葉は無視
	# node = node.next
	# continue
	# noun_flag = (features[0] == '名詞')
	# proper_noun_flag = (features[0] == '名詞') & (features[1] == '固有名詞')
	# verb_flag = (features[0] == '動詞') & (features[1] == '自立')
	# adjective_flag = (features[0] == '形容詞') & (features[1] == '自立')
	# if proper_noun_flag:
	# tokens.append(surface)
	# elif noun_flag:
	# tokens.append(surface)
	# elif verb_flag:
	# tokens.append(surface)
	# elif adjective_flag:
	# tokens.append(surface)
	# node = node.next
	# return " ".join(tokens)
	#
	# query_series = query_series.map(tokenizer_func)
	# # query_series = tokenizer_func(query_series)
	#
	# #---------------Normalization-----------#
	# query_series = query_series.map(lambda x: x.lower()) # 小文字に統一
	# # series = series.map(mojimoji.zen_to_han, kana=False) # 半角に（カタカナ除く）統一．なんか動かないし不要．
	# return query_series

	def question_vector(series, query_series=None):
	    """Vectorize the question documents, and optionally a query, with TF-IDF.

	    The original body read ``query_series`` without defining it, so every
	    call raised NameError.  It is now an optional parameter.

	    Args:
	        series: Iterable of pre-tokenized document strings; the
	            vectorizer is fitted on these.
	        query_series: Optional iterable of pre-tokenized query strings,
	            transformed with the vocabulary fitted on *series*.

	    Returns:
	        A tuple ``(question_vector, query_vector)`` of dense arrays.
	        ``query_vector`` is None when *query_series* is not given.
	    """
	    tfidf = TfidfVectorizer()
	    doc_matrix = tfidf.fit_transform(series).toarray()
	    if query_series is None:
	        return doc_matrix, None
	    query_matrix = tfidf.transform(query_series).toarray()
	    return doc_matrix, query_matrix

	def get_cs(query_series, series):
	    """Return the cosine similarity between every document and the query.

	    A TF-IDF vectorizer is fitted on *series*; *query_series* is then
	    transformed with the same vocabulary.

	    Args:
	        query_series: Iterable of pre-tokenized query strings.
	        series: Iterable of pre-tokenized document strings.

	    Returns:
	        The array returned by ``cosine_similarity`` with the document
	        vectors as first argument and the query vectors as second.
	    """
	    vectorizer = TfidfVectorizer()
	    doc_matrix = vectorizer.fit_transform(series).toarray()
	    query_matrix = vectorizer.transform(query_series).toarray()
	    return cosine_similarity(doc_matrix, query_matrix)

	def get_len_series(series):
	    """Return the number of documents in *series*.

	    The original fitted a TF-IDF model and returned the row count of the
	    resulting matrix, which equals the number of input documents (not
	    the number of words, as its comment claimed).  The count is now
	    taken directly, which avoids the fit.

	    Args:
	        series: Sized collection of documents (pandas Series or list).

	    Returns:
	        The number of documents as an int.
	    """
	    return len(series)

	def find_top_n(n, cs):
	    """Return the indices of the *n* largest similarities, best first.

	    Args:
	        n: Maximum number of indices to return.  Values larger than the
	            number of elements return every index; zero or negative
	            values return an empty array.
	        cs: Array of similarities; it is flattened before sorting.

	    Returns:
	        A 1-D integer array of flattened indices in descending order of
	        similarity.
	    """
	    ascending = np.argsort(cs, axis=None)
	    if n <= 0:
	        # ascending[-0:] would be the whole array, not an empty one.
	        return ascending[:0]
	    return ascending[-n:][::-1]

	def get_n_cs(cs, top_n_index, top_n_indices):
	    """Return the similarity stored for document *top_n_index*.

	    The original looped over *top_n_indices* and repeated the same
	    assignment on every pass, which cost a full scan per lookup and
	    raised UnboundLocalError when the list was empty.

	    Args:
	        cs: 2-D similarity array with one row per document.
	        top_n_index: Row to read.
	        top_n_indices: Unused; kept so existing callers keep working.

	    Returns:
	        The first element of row *top_n_index* of *cs*.
	    """
	    return cs[top_n_index][0]

	####
	# 今困ってるのはコサイン類似度0より大きいものの文書に出てくる単語を抽出し表示，その単語を選択させるプログラムが出来ない
	# 1. 単語の抽出
	# 2. 単語の表示
	# 3. 単語の選択（単語自体の入力もしくは番号で選択させる）
	# 4. 文書を1つに絞るまでやる

	# 同じ内容の文書（似たような文書）がある．その辺の扱いは一旦保留


	# def get_cs_words(query_series, series, texts):
	def get_new_words(cs, texts):
	    """Extract nouns and proper nouns from the documents that matched.

	    A document counts as matched when its row in *cs* contains a
	    non-zero value.

	    Args:
	        cs: 2-D similarity array with one row per document.
	        texts: Sequence of raw document strings indexed like the rows
	            of *cs*.

	    Returns:
	        The Series produced by ``preprocess`` for the matched
	        documents, restricted to '名詞' and '固有名詞'.
	    """
	    matched_rows = cs.nonzero()[0]  # row indices of the non-zero entries
	    matched_texts = [texts[row] for row in matched_rows]
	    # The leftover debug print of the result's type has been removed.
	    return preprocess(pd.Series(matched_texts), ['名詞', '固有名詞'])

	# new_q_series = pd.Series(texts[top_n_indices])

	# tfidf = TfidfVectorizer() # Tf-Idf 化関数に名前を付ける
	# question_vector = tfidf.fit_transform(series).toarray() # 質問文書を Tf-Idf を用いて数値化
	# query_vector = tfidf.transform(query_series).toarray() # 入力された質問を Tf-Idf を用いて数値化
	# # print(query_vector)
	# # cs_nonzero = query_vector.nonzero()
	# # nonzero_indices = np.argwhere(cs != 0)
	# # for nonzero_index in nonzero_indices:
	# # # print(texts[nonzero_index])
	# # print(nonzero_index)
	# # return series[nonzero_index]
	# nonzero_indices = tfidf.inverse_transform(query_vector)
	# print(nonzero_indices)
	# n_cs
	# return nonzero_indices

	# やること
	# 3. 単語の選択（単語自体の入力もしくは番号で選択させる）
	# 4. 文書を1つに絞るまでやる

	def select_topic(new_series):
	    """Count and print the candidate words found in *new_series*.

	    Each element is split on single spaces and the occurrences of every
	    word are counted across all documents.  Empty strings, which come
	    from documents without any token, are not counted as words.

	    Args:
	        new_series: pandas Series of space-joined token strings.

	    Returns:
	        A ``collections.Counter`` mapping each word to its count.
	    """
	    token_lists = new_series.str.split(' ')
	    # chain.from_iterable flattens in linear time; sum(lists, []) copied
	    # the growing list on every addition.
	    words = [word for word in itertools.chain.from_iterable(token_lists) if word]
	    new_series_count = collections.Counter(words)

	    print(f"{len(new_series_count)}件のワードが見つかりました")
	    print(f"{set(new_series_count)}")
	    return new_series_count

	def new_input():
	    """Ask the user for additional words and return them preprocessed.

	    Returns:
	        The Series produced by ``preprocess`` for the single typed
	        line, with nouns, proper nouns, verbs and adjectives enabled.
	    """
	    typed = input('上の中から近いワードを選んでください．複数選択する場合は半角スペースで区切って入力してください．: ')
	    return preprocess(pd.Series(typed), ['名詞', '固有名詞', '動詞', '形容詞'])

	# def new_question(new_query_pd, str_query):
	def new_str_query(new_input, str_query):
	    """Append the follow-up words to the original processed query.

	    Args:
	        new_input: The additional words, as a string or a pandas Series
	            of strings.
	        str_query: Series (or list) whose first element is the original
	            processed query string.

	    Returns:
	        A pandas Series holding the original query, a space, and the
	        additional words.
	    """
	    # Read the first element by position.  The label lookup
	    # str_query[0] only works for a Series with a default index.
	    if hasattr(str_query, 'iloc'):
	        first_query = str_query.iloc[0]
	    else:
	        first_query = str_query[0]
	    return pd.Series(first_query + ' ' + new_input)

	def new_question_answer(new_str_query, processed_new_texts_series, n):
	    """Rank the previously selected documents against the extended query.

	    Args:
	        new_str_query: Pre-tokenized extended query strings.
	        processed_new_texts_series: Pre-tokenized documents to rank.
	        n: Maximum number of indices to return.

	    Returns:
	        Document indices from ``find_top_n``, ordered by descending
	        cosine similarity to the query.
	    """
	    similarities = get_cs(new_str_query, processed_new_texts_series)
	    return find_top_n(n, similarities)

	def cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value):
	    """Print every document whose similarity is above *cs_value*.

	    Documents are visited in the order given by *new_top_n_indices* and
	    the scan stops at the first one that does not exceed the threshold.
	    A summary line is always printed afterwards.  The original printed
	    it only when the scan met a document at or below the threshold, so
	    it was missing when every document qualified or the list was empty.

	    Args:
	        new_top_n_indices: Document indices, best match first.
	        new_cs: 2-D similarity array with one row per document.
	        new_texts: Sequence of document strings indexed like *new_cs*.
	        new_cs_max: Unused; kept so existing callers keep working.
	        cs_value: Similarity threshold (strictly greater-than).
	    """
	    counter = 0
	    print(f"コサイン類似度 {cs_value} 以上の文書は以下の通りです。")
	    for new_top_n_index in new_top_n_indices:
	        # Same value get_n_cs() returns for this row.
	        new_n_cs = new_cs[new_top_n_index][0]
	        if new_n_cs > cs_value:
	            print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
	            counter = counter + 1
	        elif new_n_cs <= cs_value:
	            # Indices are sorted by similarity; nothing further qualifies.
	            break

	    if counter == 0:
	        print(f"コサイン類似度 {cs_value} 以上の質問文書はありません。検索ワードを変えてやり直してください。")
	    else:
	        print(f"コサイン類似度 {cs_value} 以上の質問文書は {counter} 件です。\n")


	# def cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value):
	# counter = 0
	# print(f"コサイン類似度 {cs_value} 以上の文書は以下の通りです。")
	# for new_top_n_index in top_n_indices: # 結果の表示
	# new_n_cs = get_n_cs(new_cs, new_top_n_index, top_n_indices) # n 個のコサイン類似度の取得．<class 'numpy.float64'>
	# if new_cs_max <= cs_value:
	# print(f"コサイン類似度 {cs_value} の質問文書はありません。検索ワードを変えてやり直してください。")
	# break
	# if new_n_cs > cs_value:
	# print(f"質問データ #{new_top_n_index}, コサイン類似度: {new_n_cs}, '{new_texts[new_top_n_index]}'")
	# # # print(f"#{new_top_n_index} コサイン類似度: {new_n_cs}")
	# counter = counter + 1
	# continue
	# if new_n_cs <= cs_value:
	# print(f"コサイン類似度 {cs_value} 以上の質問文書は {counter} 件です。\n")
	# break

	def print_new_words(new_top_n_indices, new_cs, new_texts, new_cs_max):
	    """Print the matching documents at thresholds 0.2, 0.3 and 0.4.

	    Args:
	        new_top_n_indices: Document indices, best match first.
	        new_cs: 2-D similarity array with one row per document.
	        new_texts: Sequence of document strings indexed like *new_cs*.
	        new_cs_max: Passed through to ``cs_selector`` unchanged.
	    """
	    for cs_value in (0.2, 0.3, 0.4):
	        cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
	# cs_value = 0.5
	# cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
	# cs_value = 0.6
	# cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
	# cs_value = 0.7
	# cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
	# cs_value = 0.8
	# cs_selector(new_top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
	#
	# def print_new_words(top_n_indices, new_cs, new_texts, new_cs_max):
	#
	# # print(new_n_cs)
	# cs_value = 0.2
	# cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
	# cs_value = 0.3
	# cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value)
	# cs_value = 0.4
	# cs_selector(top_n_indices, new_cs, new_texts, new_cs_max, cs_value)

	def listing_query(query):
	    """Wrap a single query in a one-element list.

	    Args:
	        query: The query value (a string in the visible callers).

	    Returns:
	        A new list containing only *query*.
	    """
	    return [query]
ado	*	*	10	名詞	固有名詞	一般	*	*	*	ado	ado	ado
All	*	*	10	名詞	固有名詞	一般	*	*	*	all apps	all	all
apex	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
Apex	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
APEX	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
apexone	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
APEXONE	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
ApexOne	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
Apps	*	*	10	名詞	固有名詞	一般	*	*	*	Apex One	apex	apex
eduroam	*	*	10	名詞	固有名詞	一般	*	*	*	eduroam	eduroam	eduroam
Eduroam	*	*	10	名詞	固有名詞	一般	*	*	*	eduroam	eduroam	eduroam
EDUROAM	*	*	10	名詞	固有名詞	一般	*	*	*	eduroam	eduroam	eduroam
Forms	*	*	10	名詞	固有名詞	一般	*	*	*	Forms	forms	forms
list	*	*	10	名詞	固有名詞	一般	*	*	*	list	list	list
logout	*	*	10	名詞	一般	一般	*	*	*	ログアウト	ログアウト	ログアウト
mathmatica	*	*	10	名詞	固有名詞	一般	*	*	*	mathmatica	mathmatica	mathmatica
Mathmatica	*	*	10	名詞	固有名詞	一般	*	*	*	mathmatica	mathmatica	mathmatica
MATHMATICA	*	*	10	名詞	固有名詞	一般	*	*	*	mathmatica	mathmatica	mathmatica
matlab	*	*	10	名詞	固有名詞	一般	*	*	*	matlab	mathlab	mathlab
MATHLAB	*	*	10	名詞	固有名詞	一般	*	*	*	matlab	mathlab	mathlab
Matlab	*	*	10	名詞	固有名詞	一般	*	*	*	matlab	mathlab	mathlab
office	*	*	10	名詞	固有名詞	一般	*	*	*	Office	オフィス	オフィス
Office	*	*	10	名詞	固有名詞	一般	*	*	*	Office	オフィス	オフィス
ocu	*	*	10	名詞	固有名詞	一般	*	*	*	ocu	ocu	ocu
Ocu	*	*	10	名詞	固有名詞	一般	*	*	*	ocu	ocu	ocu
OCU	*	*	10	名詞	固有名詞	一般	*	*	*	ocu	ocu	ocu
ocuid	*	*	10	名詞	固有名詞	一般	*	*	*	ocuid	ocuid	ocuid
OCUID	*	*	10	名詞	固有名詞	一般	*	*	*	ocuid	ocuid	ocuid
OCUNET	*	*	10	名詞	固有名詞	一般	*	*	*	ocunet	ocunet	ocunet
ocunet	*	*	10	名詞	固有名詞	一般	*	*	*	ocunet	ocunet	ocunet
Pro	*	*	10	名詞	固有名詞	一般	*	*	*	pro	pro	pro
pro	*	*	10	名詞	固有名詞	一般	*	*	*	pro	pro	pro
PRO	*	*	10	名詞	固有名詞	一般	*	*	*	pro	pro	pro
Publisher	*	*	10	名詞	固有名詞	一般	*	*	*	publisher	publisher	publisher
Teams	*	*	10	名詞	固有名詞	一般	*	*	*	teams	teams	teams
teams	*	*	10	名詞	固有名詞	一般	*	*	*	teams	teams	teams
TEAMS	*	*	10	名詞	固有名詞	一般	*	*	*	teams	teams	teams
TrendMicro	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
trendmicro	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
Trendmicro	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
TRENDMICRO	*	*	10	名詞	固有名詞	一般	*	*	*	トレンドマイクロ	トレンドマイクロ	トレンドマイクロ
unipa	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
Unipa	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
UNIPA	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
ユニパ	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニパ	ユニパ
ユニバーサルパスポート	*	*	10	名詞	固有名詞	一般	*	*	*	ユニバーサルパスポート	ユニバーサルパスポート	ユニバーサルパスポート
update	*	*	10	名詞	固有名詞	一般	*	*	*	アップデート	アップデート	アップデート
Update	*	*	10	名詞	固有名詞	一般	*	*	*	アップデート	アップデート	アップデート
UPDATE	*	*	10	名詞	固有名詞	一般	*	*	*	アップデート	アップデート	アップデート
WEB	*	*	10	名詞	一般	一般	*	*	*	web	ウェブ	ウェブ
Web	*	*	10	名詞	一般	一般	*	*	*	web	ウェブ	ウェブ
WebAuth	*	*	10	名詞	固有名詞	一般	*	*	*	webauth	webauth	webauth
webauth	*	*	10	名詞	固有名詞	一般	*	*	*	webauth	webauth	webauth
WebEx	*	*	10	名詞	固有名詞	一般	*	*	*	webex	webex	webex
webex	*	*	10	名詞	固有名詞	一般	*	*	*	webex	webex	webex
Wifi	*	*	10	名詞	一般	一般	*	*	*	Wi-Fi	ワイファイ	ワイファイ
WINDOWS	*	*	10	名詞	固有名詞	一般	*	*	*	Windows	ウィンドウズ	ウィンドウズ
WIndows	*	*	10	名詞	固有名詞	一般	*	*	*	Windows	ウィンドウズ	ウィンドウズ
windows	*	*	10	名詞	固有名詞	一般	*	*	*	Windows	ウィンドウズ	ウィンドウズ
ZOOM	*	*	10	名詞	固有名詞	一般	*	*	*	Zoom	ズーム	ズーム
聞蔵	*	*	10	名詞	固有名詞	一般	*	*	*	聞蔵	きくぞう	きくぞう
Questions	First query		cs >= 0.2	satisfy(0.2)	cs >= 0.3	satisfy(0.3)	cs >= 0.4	satisfy(0.4)	MAX CS	satisfy(0.2) = 1	satisfy(0.3) = 1	satisfy(0.4) = 1
（非常勤講師）メールアドレスが欲しい	メールアドレス		12	1	9	1	8	1	0.61	0.61	0.61	0.61
eduroamの認証方法が分からない	eduroam		10	1	10	1	9	1	0.77	0.77	0.77	0.77
eduroamの認証方法が分からない	eduroam	1
MACアドレスを調べる	MACアドレス		1	0	1	0	1	0	0.53
Officeのインストール方法が分からない	インストール出来ない		29	1	15	1	4	0	0.66	0.66	0.66
Officeのインストール方法が分からない	officeのインストール		22	1	15	1	7	1	0.74	0.74	0.74	0.74
Teamsの設定方法がわからない	Teams		7	1	6	1	6	1	0.66	0.66	0.66	0.66
ThunderbirdにOCUメールを設定したい	Thunderbird		7	1	6	1	5	1	0.54	0.54	0.54	0.54
ThunderbirdにOCUメールを設定したい	Thunderbird	1
Unipaのマニュアルが欲しい	Unipa		1	1	1	1	1	1	0.4	0.4	0.4	0.4
VPNに接続できない	vpn		7	1	7	1	4	1	0.62	0.62	0.62	0.62
VPNに接続できない	仮想ネットワーク		22	1	15	1	12	1	0.79	0.79	0.79	0.79
VPNの接続方法がわからない	仮想ネットワーク	1
VPNの接続方法がわからない	vpn	1
VPNの接続方法がわからない	vpn	1
Wi-Fiに接続できない	wi-fi		10	1	9	1	9	1	0.79	0.79	0.79	0.79
Wi-Fiに接続できない	wi-fi	1
Wi-Fiのパスワードがわからない	wi-fi	1
Windows10Proが欲しい	Windows10Pro		4	1	3	1	2	1	0.71	0.71	0.71	0.71
Cisco WebEXの有料アカウントが欲しい	WebEx		1	1	1	1	1	1	0.57	0.57	0.57	0.57
Zoomの有料アカウントが欲しい	zoom		15	1	12	1	8	1	0.67	0.67	0.67	0.67
Zoomの有料アカウントが欲しい	Zoom	1
Zoomの有料アカウントが欲しい	zoom	1
Zoomのログインする方法を知りたい	zoom	1	15	1	12	1	8	1	0.67	0.67	0.67	0.67
ウイルス対策ソフト ApexOneのインストール方法が分からない	ウイルス対策ソフト		12	1	7	1	6	1	0.81	0.81	0.81	0.81
ウイルス対策ソフト ApexOneのインストール方法が分からない	Apexone		11	1	11	1	7	1	0.81	0.81	0.81	0.81
ウイルス対策ソフト ApexOneのインストール方法が分からない	TrendMicro		8	1	8	1	1	0	0.46	0.46	0.46
共有PCにOfficeをインストールしたい	共有PC		10	1	5	1	3	1	0.75	0.75	0.75	0.75
固定IPアドレスについて	IPアドレス		1	1	1	1	1	1	0.47	0.47	0.47	0.47
ネットワークに接続できない	ネットワーク		29	1	20	1	8	1	0.52	0.52	0.52	0.52
ネットワークに接続できない	ネットワーク	1
ネットワークに接続できない	ネットワーク	1
プリンターで印刷できない	印刷できない		15	1	7	1	4	1	0.82	0.82	0.82	0.82
プリンターで印刷できない	繋がらない		3	0	3	0	3	0	0.85
プリンターで印刷できない	接続出来ない		33	1	17	1	6	1	0.77	0.77	0.77	0.77
プリンターで印刷できない	プリンター		4	1	4	1	4	1	0.67	0.67	0.67	0.67
プリンターで印刷できない	プリンター	1	4	1	4	1	4	1	0.67	0.67	0.67	0.67
プリンターで印刷できない	プリンター	1	4	1	4	1	4	1	0.67	0.67	0.67	0.67
名誉教授がVPNを使用したい	名誉教授		2	1	2	1	2	1	0.71	0.71	0.71	0.71
名誉教授向けのOCUIDについて	名誉教授		2	1	2	1	2	1	0.71	0.71	0.71	0.71
迷惑メールに分類されてしまう	迷惑メール		2	1	2	1	1	1	0.58	0.58	0.58	0.58
メーリングリストを差出人に設定したい	メーリングリスト		8	1	8	1	5	1	0.55	0.55	0.55	0.55
メーリングリストを差出人に設定したい	メーリングリスト	1
メーリングリストを差出人に設定したい	メーリングリスト	1
メールアドレス (@st.osaka-cu.ac.jp) が使用できない	メールアドレス		12	1	9	0	8	0	0.61	0.61
リモートデスクトップに接続できない	リモートデスクトップ		9	1	9	1	7	1	0.71	0.71	0.71	0.71
リモートデスクトップに接続できない	リモートデスクトップ	1
全学認証パスワードが分からない	全学認証パスワード		25	1	9	1	6	1	0.85	0.85	0.85	0.85
全学認証仮パスワードがわからない	全学認証		19	1	13	1	8	1	0.72	0.72	0.72	0.72
全学認証パスワードの初期パスワードが分からない	全学認証	1
		18	10.74285714	0.942857143	7.628571429	0.914285714	5	0.857142857	0.669714286	0.668484848	0.6703125	0.677666667
Questions	First query	Second query	cs >= 0.2	satisfy(0.2)	cs >= 0.3	satisfy(0.3)	cs >= 0.4	satisfy(0.4)	最大CS	satisfy(0.2) = 1	satisfy(0.3) = 1	satisfy(0.4) = 1
（非常勤講師）メールアドレスが欲しい	メールアドレス	付与	1	1	1	1	1	1	0.68	0.68	0.68	0.68
eduroamの認証方法が分からない	eduroam	接続方法	4	1	4	1	4	1	1	1	1	1
eduroamの認証方法が分からない	eduroam	設定	2	1	2	1	2	1	0.78	0.78	0.78	0.78
MACアドレスを調べる	MACアドレス	調べる	1	0	1	0	1	0	0.35
Officeのインストール方法が分からない	インストール出来ない	office	7	1	5	1	3	1	0.64	0.64	0.64	0.64
Officeのインストール方法が分からない	officeのインストール	方法	10	0	5	0	2	0	0.63
Teamsの設定方法がわからない	Teams	設定	6	1	3	1	0	1	0.39	0.39	0.39	0.39
ThunderbirdにOCUメールを設定したい	Thunderbird	設定方法	5	1	5	1	4	1	0.53	0.53	0.53	0.53
ThunderbirdにOCUメールを設定したい	Thunderbird	OCUメール	7	1	5	1	5	1	0.77	0.77	0.77	0.77
Unipaのマニュアルが欲しい	Unipa	マニュアル	1	1	1	1	1	1	0.53	0.53	0.53	0.53
VPNに接続できない	vpn	繋がらない	5	1	1	1	0	0	0.32	0.32	0.32
VPNに接続できない	仮想ネットワーク	接続	5	0	1	0	1	0	0.45
VPNの接続方法がわからない	仮想ネットワーク	登録方法	10	1	5	1	4	1	0.91	0.91	0.91	0.91
VPNの接続方法がわからない	vpn	接続方法	4	1	4	1	2	1	0.62	0.62	0.62	0.62
VPNの接続方法がわからない	vpn	登録方法	5	1	1	1	0	0	0.33	0.33	0.33	0.33
Wi-Fiに接続できない	wi-fi	接続	3	0	3	0	2	0	0.52
Wi-Fiに接続できない	wi-fi	繋がらない	0	0	0	0	0	0	0.6
Wi-Fiのパスワードがわからない	wi-fi	パスワード	4	1	4	1	3	1	0.57	0.57	0.57	0.57
Windows10Proが欲しい	Windows10Pro	欲しい	2	1	2	1	2	1	0.51	0.51	0.51	0.51
Cisco WebEXの有料アカウントが欲しい	WebEx	有料	1	1	1	1	1	1	0.63	0.63	0.63	0.63
Zoomの有料アカウントが欲しい	zoom	有料アカウント	7	1	3	1	2	1	0.49	0.49	0.49	0.49
Zoomの有料アカウントが欲しい	Zoom	有償アカウント	7	1	4	1	2	1	0.53	0.53	0.53	0.53
Zoomの有料アカウントが欲しい	zoom	学生	1	1	1	1	1	1	0.67	0.67	0.67	0.67
Zoomのログインする方法を知りたい	zoom	ログイン	5	1	5	1	4	1	0.72	0.72	0.72	0.72
ウイルス対策ソフト ApexOneのインストール方法が分からない	ウイルス対策ソフト	インストール	6	1	5	1	3	1	0.79	0.79	0.79	0.79
ウイルス対策ソフト ApexOneのインストール方法が分からない	Apexone	インストール	7	1	5	1	2	1	0.67	0.67	0.67	0.67
ウイルス対策ソフト ApexOneのインストール方法が分からない	TrendMicro	ダウンロード出来ない	3	1	3	1	2	1	0.6	0.6	0.6	0.6
共有PCにOfficeをインストールしたい	共有PC	office	4	1	3	1	3	1	0.67	0.67	0.67	0.67
固定IPアドレスについて	IPアドレス	固定	1	1	1	1	1	1	0.53	0.53	0.53	0.53
ネットワークに接続できない	ネットワーク	使用出来ない	2	1	2	1	1	1	0.5	0.5	0.5	0.5
ネットワークに接続できない	ネットワーク	使えない	1	0	1	0	1	0	0.43
ネットワークに接続できない	ネットワーク	切れる	9	1	0	0	0	0	0.28	0.28
プリンターで印刷できない	印刷できない	プリンター	4	1	2	1	2	1	0.81	0.81	0.81	0.81
プリンターで印刷できない	繋がらない	プリンター	3	0	1	0	1	0	0.51
プリンターで印刷できない	接続出来ない	プリンター	15	1	4	0	1	0	0.63	0.63
プリンターで印刷できない	プリンター	印刷できない	1	1	1	1	1	1	0.87	0.87	0.87	0.87
プリンターで印刷できない	プリンター	繋がらない	6	1	2	1	1	1	0.4	0.4	0.4	0.4
プリンターで印刷できない	プリンター	接続出来ない	1	1	1	1	1	1	0.4	0.4	0.4	0.4
名誉教授がVPNを使用したい	名誉教授	vpn	1	1	1	1	1	1	0.66	0.66	0.66	0.66
名誉教授向けのOCUIDについて	名誉教授	OCUID	2	1	1	1	1	1	0.66	0.66	0.66	0.66
迷惑メールに分類されてしまう	迷惑メール	OCUメール	1	1	1	1	1	1	0.66	0.66	0.66	0.66
メーリングリストを差出人に設定したい	メーリングリスト	設定	3	1	3	1	3	1	0.63	0.63	0.63	0.63
メーリングリストを差出人に設定したい	メーリングリスト	Thunderbird	1	1	1	1	1	1	0.6	0.6	0.6	0.6
メーリングリストを差出人に設定したい	メーリングリスト	差出人	4	1	4	1	4	1	0.53	0.53	0.53	0.53
メールアドレス (@st.osaka-cu.ac.jp) が使用できない	メールアドレス	使用出来ない	2	1	2	1	1	1	0.52	0.52	0.52	0.52
リモートデスクトップに接続できない	リモートデスクトップ	繋がらない	1	1	1	1	1	1	0.54	0.54	0.54	0.54
リモートデスクトップに接続できない	リモートデスクトップ	接続出来ない	4	1	3	1	3	1	0.58	0.58	0.58	0.58
全学認証パスワードが分からない	全学認証パスワード	分からない	8	1	7	1	6	1	0.63	0.63	0.63	0.63
全学認証パスワードの初期パスワードが分からない	全学認証	仮パスワード	6	1	5	1	5	1	0.67	0.67	0.67	0.67
全学認証パスワードの初期パスワードが分からない	全学認証	初期パスワード	5	1	5	1	4	1	0.89	0.89	0.89	0.89
		Average	4.08	0.86	2.64	0.82	1.96	0.78	0.5966	0.61255814	0.620243902	0.62775
	import importlib
	# importlib.reload()
	import argparse
	import numpy as np
	import glob
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	from IPython.display import display

	import pandas as pd

	import MeCab
	tagger = MeCab.Tagger("-Ochasen")
	import mojimoji
	import os
	import urllib

	from collections import Counter
	import collections

	text_paths = glob.glob('data/ocu2/*.txt')
	from func import all
	# from func import q_get
	# from func import a_get
	# from func import load_stopwords
	# from func import preprocess
	# from func import get_cs
	# from func import find_top_n
	# from func import get_n_cs
	# from func import listing_query

	def main(args):
	    """Run the single-stage question search for ``args.query``.

	    Every question document is ranked against the query by TF-IDF
	    cosine similarity and the matches are printed at the thresholds
	    used by ``all.print_new_words``.  Prints "NotFound" when no
	    document has a similarity above 1e-10.

	    Args:
	        args: Parsed command-line namespace; only ``args.query`` is read.
	    """
	    pos_tags = ['名詞', '固有名詞', '動詞', '形容詞']

	    texts = all.q_get(text_paths)
	    all.a_get(text_paths)  # answers are read but not used by this script
	    query_texts = all.listing_query(args.query)

	    str_data = all.preprocess(pd.Series(texts), pos_tags)
	    str_query = all.preprocess(pd.Series(query_texts), pos_tags)

	    cs = all.get_cs(str_query, str_data)
	    n = all.get_len_series(str_data)  # number of documents in the data set
	    top_n_indices = all.find_top_n(n, cs)  # document indices, best match first

	    # The result is not used below; the call is kept so the script runs
	    # the same steps as before.
	    all.get_new_words(cs, texts)

	    best_row = np.argmax(cs)
	    max_cs = cs[best_row][0]
	    if not max_cs > 1e-10:
	        print("NotFound")
	        return

	    # The fourth argument is the best similarity value itself.
	    all.print_new_words(top_n_indices, cs, texts, max_cs)
	    print(f"コサイン類似度最大番号: {top_n_indices[0]}, コサイン類似度: {max_cs}")

	if __name__ == "__main__":
	    # Command-line entry point: the single positional argument is the
	    # free-text search query that main() reads as ``args.query``.
	    cli_parser = argparse.ArgumentParser()
	    cli_parser.add_argument("query", type=str)
	    main(cli_parser.parse_args())