Skip to content

Instantly share code, notes, and snippets.

View cxtadment's full-sized avatar

Xiaoting Cai cxtadment

View GitHub Profile
def removeTopic(content):
    """Strip Weibo-style ``#topic#`` hashtags from *content*.

    A topic is one or more characters enclosed by a pair of ``#`` marks.
    Every such span, including both ``#`` delimiters, is deleted.

    Args:
        content: The microblog text to clean.

    Returns:
        The text with every ``#...#`` span removed; text without topics is
        returned unchanged.
    """
    # The character class covers U+0000-U+9FFF, which includes '#' itself as
    # well as whitespace. The quantifier therefore has to be lazy: a greedy
    # match would run from the first '#' to the last one and delete the
    # ordinary text sitting between two separate topics.
    # The backslash is doubled so the non-raw literal has no invalid escape.
    return re.sub(u"#[\\w\u0000-\u9FFF]+?#", '', content)
# Collect the distinct words used by all stored microblogs of one type.
#
# Queries ``Microblog.objects`` filtered by ``microblogType``, concatenates
# the ``words`` attribute of every result into one list, builds an
# ``nltk.FreqDist`` over that list and takes its keys as ``words_features``.
#
# NOTE(review): the original indentation was lost and no return or
# persistence step is visible after ``words_features`` is computed; the
# snippet looks truncated. The name suggests the list is pickled afterwards,
# but that is not shown here -- confirm against the full source.
def pickle_words_features(microblogType):
microblogs = Microblog.objects(microblogType=microblogType)
# Flat list of every word occurrence across the selected microblogs.
all_words = []
for microblog in microblogs:
all_words.extend(microblog.words)
# The name is rebound from the raw list to its frequency distribution.
all_words = nltk.FreqDist(all_words)
words_features = list(all_words.keys())
# Segment ``microblog_text`` with ``jieba.cut`` and glue every token found in
# ``ESCAPE_WORDS`` onto the token that follows it.
#
# NOTE(review): the original indentation was lost and the snippet ends
# before any polarity counting or return statement, so it looks truncated.
# Whether ``seg_list.pop(t)`` and ``t += 1`` sit inside the ``if`` cannot be
# determined from this view -- confirm against the full source.
def polarity_count(self, microblog_text):
seg_list = list(jieba.cut(microblog_text))
t = 0
# Stop one short of the end because each step may touch seg_list[t + 1].
while t < len(seg_list) - 1:
if seg_list[t] in ESCAPE_WORDS:
# Prefix the escape word onto its successor, then drop the original slot.
seg_list[t + 1] = seg_list[t] + seg_list[t + 1]
seg_list.pop(t)
t += 1
def pos_tagging(self, microblog_text):
    """Tag *microblog_text* with ``pseg.cut`` and keep the accepted pairs.

    Each (word, tag) pair produced by the tagger is passed to
    ``self.seg_filter``; only pairs for which it returns a truthy value
    are kept.

    Returns:
        A ``(words, taggings)`` tuple of two parallel lists.
    """
    accepted = [
        (token, tag)
        for token, tag in pseg.cut(microblog_text)
        if self.seg_filter(token, tag)
    ]
    kept_words = [token for token, _ in accepted]
    kept_tags = [tag for _, tag in accepted]
    return kept_words, kept_tags
# Decide whether a segmented ``word`` should be rejected.
#
# Returns False when the word is in ``self.stopwords``, contains a digit, or
# is in ``self.topics``. ``tagging`` is not used by the visible lines.
#
# NOTE(review): the original indentation was lost and no accepting
# ``return`` is visible after the last check, so the snippet looks
# truncated -- confirm against the full source.
def seg_filter(self, word, tagging):
# filter stop words including punctuation
if word in self.stopwords:
return False
# filter element containing number
# (the lookahead succeeds when a digit occurs anywhere on the word's first line)
if re.match('^(?=.*\\d)', word):
return False
# if the word is in the topics
if word in self.topics:
return False
def removeBracket(content):
    """Remove parenthesised asides from *content*.

    Full-width parentheses (U+FF08 / U+FF09) are first normalised to their
    ASCII forms, then every ``(...)`` group holding at least one character
    is deleted together with its parentheses.

    Args:
        content: The microblog text to clean.

    Returns:
        The text with full-width parentheses normalised and every
        non-empty parenthesised group removed.
    """
    # Written as escapes so the full-width forms survive any re-encoding of
    # this file; replacing an ASCII parenthesis with itself would do nothing.
    content = content.replace(u'\uFF08', '(').replace(u'\uFF09', ')')
    # The class U+0000-U+9FFF contains '(' and ')' themselves, so the
    # quantifier must be lazy; a greedy match would swallow the ordinary
    # text between two separate bracket groups.
    return re.sub(u"\\([\\w\u0000-\u9FFF]+?\\)", '', content)
def removeForward(content):
    """Return the part of *content* that precedes the first ``//``.

    When no ``//`` occurs, the text is returned unchanged.
    """
    own_text, _marker, _forwarded = content.partition("//")
    return own_text
def removePrivate(content):
    """Remove ``@user`` mentions from *content*.

    A mention is ``@`` followed by one or more word characters, CJK
    ideographs (U+4E00-U+9FFF) or hyphens. The mention ends at the first
    whitespace or punctuation character, which is left in place.

    Args:
        content: The microblog text to clean.

    Returns:
        The text with every mention deleted.
    """
    # The class is limited to characters that can form a handle. A range
    # starting at U+0000 would also match spaces and punctuation, so one
    # mention would swallow the rest of the message.
    # A single re.sub pass removes each match where it was found; deleting
    # matched strings one after another with str.replace would corrupt a
    # longer handle when a shorter handle is its prefix ("@a" inside "@ab").
    return re.sub(u"@[\\w\u4E00-\u9FFF-]+", '', content, flags=re.UNICODE)
def convertPun(content):
    """Normalise punctuation in *content*.

    Sentence and clause punctuation (ASCII and Chinese/full-width) becomes
    ``'.'``, the opening bracket ``'【'`` becomes a space, and quotation and
    book-title marks are deleted.

    Args:
        content: The microblog text to normalise.

    Returns:
        The text after all replacements have been applied.
    """
    # Replacements run in this order. '.....' stays where it was in the
    # sequence: it collapses runs produced by the earlier entries but not by
    # the ones that follow it.
    # Full-width forms are written as escapes so they cannot be folded into
    # their ASCII look-alikes by a re-encoding of this file.
    to_period = (
        ',', u'\uFF0C',      # ASCII and full-width comma
        '。',
        '?', u'\uFF1F',      # ASCII and full-width question mark
        '!', u'\uFF01',      # ASCII and full-width exclamation mark
        '……',
        ':',
        '「', '」',
        '.....',
        '】',
        u'\uFF1A',           # full-width colon
        '、',
    )
    to_delete = ('《', '》', '“', '”', '"', u'\uFF02')

    for mark in to_period:
        content = content.replace(mark, '.')
    content = content.replace('【', ' ')
    for mark in to_delete:
        content = content.replace(mark, '')
    return content