Skip to content

Instantly share code, notes, and snippets.

@jwlin jwlin/getTopic.py
Last active Oct 10, 2018

Embed
What would you like to do?
def getTopic(self, ai, imgtopic, Dict):
    """Predict the topic label for a UI element.

    Extracts a feature string for the element, trains an LSI similarity
    model over the labelled corpus files under ``corpus/``, and returns
    the topic type of the most similar training document.

    Args:
        ai: element/context object forwarded to ``self.extract_features``.
        imgtopic: image-topic input forwarded to ``self.extract_features``.
        Dict: lookup dict forwarded to ``self.extract_features``.

    Returns:
        str: a topic type contained in ``CnUtil.queryVocalbulary()``, or
        ``'_TOPIC@unknown'`` when the predicted type is not in that set.

    NOTE(review): the model is re-trained on every call. ``dictionary``,
    ``tfidf``, ``lsi``, ``index`` and ``training_topic`` all support
    save()/load() and could be persisted once and reloaded — see
    https://radimrehurek.com/gensim/tut2.html and tut3.html.
    """
    # Whitespace-normalized feature string of the element to classify.
    features = str(re.sub(' +', ' ', ' '.join(self.extract_features(ai, imgtopic, Dict, 1))))

    # --- load labelled training data --------------------------------------
    current_dir = os.path.dirname(__file__)  # fixed: was `_file_` (NameError)
    corpus_dir = os.path.join(current_dir, 'corpus', 'all-corpus')
    # answer maps a feature string -> its labelled topic type.
    answer = dict()
    with open(os.path.join(current_dir, 'corpus', 'label-all-corpus.json'), 'r') as f:
        data = json.load(f)
    for v in data.values():
        if v['feature'] in answer:
            # The same feature string must never carry two different labels.
            assert answer[v['feature']] == v['type']
        else:
            answer[v['feature']] = v['type']

    # Each corpus file holds one tokenized document per line; the filename
    # prefix (before '-') identifies the document set.
    ids = list()
    all_corpus = dict()
    for fname in os.listdir(corpus_dir):
        key = fname.split('-')[0]
        ids.append(key)
        # fixed: the file handle was previously opened inline and leaked
        with open(os.path.join(corpus_dir, fname), 'r') as cf:
            all_corpus[key] = [line.lower().split() for line in cf]
    training_ids = ids[:]

    # --- training ----------------------------------------------------------
    corpus = []
    for t_id in training_ids:
        corpus += all_corpus[t_id]
    dictionary = corpora.Dictionary(corpus)
    # Common words to drop before building the model.
    stoplist = set('your a the is and or in be to of for not on with as by'.split())
    stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
    dictionary.filter_tokens(stop_ids)  # remove stop words
    dictionary.compactify()  # remove gaps in the id sequence after removal
    corpus_bow = []
    for t_id in training_ids:
        corpus_bow += [dictionary.doc2bow(c) for c in all_corpus[t_id]]
    tfidf = models.TfidfModel(corpus_bow)  # tf-idf weighting
    corpus_tfidf = tfidf[corpus_bow]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)  # truncated SVD
    corpus_lsi = lsi[corpus_tfidf]
    # Similarity index over the training documents in the latent space.
    index = similarities.MatrixSimilarity(corpus_lsi)
    # training_topic maps the training-document index (as str) to its label.
    training_topic = {}
    for i in range(len(corpus)):
        feature = ' '.join(corpus[i])
        training_topic[str(i)] = {
            'type': answer[feature],
            'feature': feature
        }

    # --- prediction --------------------------------------------------------
    num_multiple_types = 0  # ambiguous top-5 votes (kept for debugging parity)
    d = features.lower().split()
    vec_bow = dictionary.doc2bow(d)
    vec_tfidf = tfidf[vec_bow]
    vec_lsi = lsi[vec_tfidf]
    sims = index[vec_lsi]  # cosine similarity against every training document
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # Default prediction: the type of the single most similar document.
    vec_type = training_topic[str(sims[0][0])]['type']
    # When the top-5 similarities are nearly tied, take a majority vote.
    # NOTE(review): assumes at least 5 training documents exist — confirm.
    if (sims[0][1] - sims[4][1]) < 0.1:
        topic_count = {}
        for s in sims[:5]:
            t = training_topic[str(s[0])]['type']
            topic_count[t] = topic_count.get(t, 0) + 1
        max_times = max(topic_count.values())
        max_types = {training_topic[str(v[0])]['type'] for v in sims[:5]
                     if topic_count[training_topic[str(v[0])]['type']] == max_times}
        if len(max_types) > 1:
            # Several types tie for the majority: break the tie at random.
            num_multiple_types += 1
            vec_type = random.choice(list(max_types))
    if vec_type in CnUtil.queryVocalbulary():
        return vec_type
    return '_TOPIC@unknown'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.