Skip to content

Instantly share code, notes, and snippets.

@jwlin jwlin/getTopic.py
Last active Oct 10, 2018

Embed
What would you like to do?
def getTopic(self, ai, imgtopic, Dict):
    """Predict the topic label for a UI element.

    Extracts a feature string for the element, trains an LSI similarity
    model over the labelled corpus files under ``corpus/``, and returns
    the topic type of the most similar training document.

    Args:
        ai: element/context object forwarded to ``self.extract_features``.
        imgtopic: image-topic input forwarded to ``self.extract_features``.
        Dict: lookup dict forwarded to ``self.extract_features``.

    Returns:
        str: a topic type contained in ``CnUtil.queryVocalbulary()``, or
        ``'_TOPIC@unknown'`` when the predicted type is not in that set.

    NOTE(review): the model is re-trained on every call. ``dictionary``,
    ``tfidf``, ``lsi``, ``index`` and ``training_topic`` all support
    save()/load() and could be persisted once and reloaded — see
    https://radimrehurek.com/gensim/tut2.html and tut3.html.
    """
    # Whitespace-normalized feature string of the element to classify.
    features = str(re.sub(' +', ' ', ' '.join(self.extract_features(ai, imgtopic, Dict, 1))))

    # --- load labelled training data --------------------------------------
    current_dir = os.path.dirname(__file__)  # fixed: was `_file_` (NameError)
    corpus_dir = os.path.join(current_dir, 'corpus', 'all-corpus')
    # answer maps a feature string -> its labelled topic type.
    answer = dict()
    with open(os.path.join(current_dir, 'corpus', 'label-all-corpus.json'), 'r') as f:
        data = json.load(f)
    for v in data.values():
        if v['feature'] in answer:
            # The same feature string must never carry two different labels.
            assert answer[v['feature']] == v['type']
        else:
            answer[v['feature']] = v['type']

    # Each corpus file holds one tokenized document per line; the filename
    # prefix (before '-') identifies the document set.
    ids = list()
    all_corpus = dict()
    for fname in os.listdir(corpus_dir):
        key = fname.split('-')[0]
        ids.append(key)
        # fixed: the file handle was previously opened inline and leaked
        with open(os.path.join(corpus_dir, fname), 'r') as cf:
            all_corpus[key] = [line.lower().split() for line in cf]
    training_ids = ids[:]

    # --- training ----------------------------------------------------------
    corpus = []
    for t_id in training_ids:
        corpus += all_corpus[t_id]
    dictionary = corpora.Dictionary(corpus)
    # Common words to drop before building the model.
    stoplist = set('your a the is and or in be to of for not on with as by'.split())
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)  # remove stop words
    dictionary.compactify()  # remove gaps in the id sequence after removal
    corpus_bow = []
    for t_id in training_ids:
        corpus_bow += [dictionary.doc2bow(c) for c in all_corpus[t_id]]
    tfidf = models.TfidfModel(corpus_bow)  # tf-idf weighting
    corpus_tfidf = tfidf[corpus_bow]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)  # truncated SVD
    corpus_lsi = lsi[corpus_tfidf]
    # Similarity index over the training documents in the latent space.
    index = similarities.MatrixSimilarity(corpus_lsi)
    # training_topic maps the training-document index (as str) to its label.
    training_topic = {}
    for i in range(len(corpus)):
        feature = ' '.join(corpus[i])
        training_topic[str(i)] = {
            'type': answer[feature],
            'feature': feature
        }

    # --- prediction --------------------------------------------------------
    num_multiple_types = 0  # ambiguous top-5 votes (kept for debugging parity)
    d = features.lower().split()
    vec_bow = dictionary.doc2bow(d)
    vec_tfidf = tfidf[vec_bow]
    vec_lsi = lsi[vec_tfidf]
    sims = index[vec_lsi]  # cosine similarity against every training document
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # Default prediction: the type of the single most similar document.
    vec_type = training_topic[str(sims[0][0])]['type']
    # When the top-5 similarities are nearly tied, take a majority vote.
    # NOTE(review): assumes at least 5 training documents exist — confirm.
    if (sims[0][1] - sims[4][1]) < 0.1:
        topic_count = {}
        for s in sims[:5]:
            t = training_topic[str(s[0])]['type']
            topic_count[t] = topic_count.get(t, 0) + 1
        max_times = max(topic_count.values())
        max_types = {training_topic[str(v[0])]['type'] for v in sims[:5]
                     if topic_count[training_topic[str(v[0])]['type']] == max_times}
        if len(max_types) > 1:
            # Several types tie for the majority: break the tie at random.
            num_multiple_types += 1
            vec_type = random.choice(list(max_types))
    if vec_type in CnUtil.queryVocalbulary():
        return vec_type
    return '_TOPIC@unknown'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.