Last active
November 12, 2021 05:29
-
-
Save ryzhovau/3b692c3f0d8af84bd196b052c039b403 to your computer and use it in GitHub Desktop.
Выделение нормализованных названий городов из Telegram-сообщений и вывод наиболее часто встречающихся. Использует библиотеку natasha для natural language processing'а https://habr.com/ru/post/516098/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import json, re | |
from collections import Counter | |
from natasha import ( | |
Segmenter, | |
MorphVocab, | |
NewsEmbedding, | |
NewsMorphTagger, | |
NewsSyntaxParser, | |
NewsNERTagger, | |
PER, | |
NamesExtractor, | |
DatesExtractor, | |
MoneyExtractor, | |
AddrExtractor, | |
Doc | |
) | |
# --- natasha NLP pipeline setup ---
# One shared news embedding feeds the morphology, syntax and NER taggers.
emb = NewsEmbedding()

segmenter = Segmenter()
morph_vocab = MorphVocab()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

# Fact extractors built on the shared morph vocabulary.
names_extractor = NamesExtractor(morph_vocab)
dates_extractor = DatesExtractor(morph_vocab)
money_extractor = MoneyExtractor(morph_vocab)
addr_extractor = AddrExtractor(morph_vocab)
# --- Load the Telegram channel JSON export and collect plain-text messages ---
# These words appear so often in this chat that they glue themselves to the
# NLP NER entities; strip them (case-insensitive, whole word) before tagging.
# Compile the patterns once instead of letting re.sub recompile per message.
_NOISE_PATTERNS = [
    re.compile(r'(?i)\b{}\b'.format(re.escape(word)))
    for word in ('СОХ', 'Монолит', 'завод', 'Лид', 'Склад', 'РМ', 'УТМ')
]

# Telegram exports are UTF-8; be explicit so the script is locale-independent.
with open("UTM_Help.json", "r", encoding="utf-8") as read_file:
    data = json.load(read_file)

# Keep only textual messages, cleaned of newlines and noise words.
messages = []
for msg in data['messages']:
    if msg['type'] != 'message':
        continue
    raw = msg['text']
    if isinstance(raw, str):
        text = raw
    else:
        # 'text' may be a list mixing plain strings with entity dicts
        # (links, mentions, ...).  Join ALL string fragments — the original
        # kept only the first one and silently dropped the rest of the text.
        text = ' '.join(part for part in raw if isinstance(part, str))
    if not text:
        continue
    text = text.replace('\n', ' ')
    for pattern in _NOISE_PATTERNS:
        text = pattern.sub('', text)
    messages.append(text)
# --- Run the natasha pipeline and collect normalized city/location names ---
cities = []
for msg in messages:
    try:
        doc = Doc(msg)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        doc.parse_syntax(syntax_parser)
        doc.tag_ner(ner_tagger)
        for span in doc.spans:
            # Only location entities matter here; skip normalizing the rest
            # (the original normalized every span before checking the type).
            if span.type != 'LOC':
                continue
            span.normalize(morph_vocab)
            # Normal form may come back in mixed case; canonicalize it.
            cities.append(span.normal.strip().lower().capitalize())
    except Exception:
        # Some messages make natasha choke; skip them rather than abort.
        # Narrowed from a bare `except:` so Ctrl-C / SystemExit still work.
        continue
# --- Report the most frequently mentioned cities ---
# Counter is already imported; use it instead of a hand-rolled dict tally.
# Counter is a dict subclass, so cities_dict stays backward-compatible.
cities_dict = Counter(cities)
print('Среди', len(messages), 'сообщений геолокации упоминаются', len(cities), 'раз. Топ из них:')
# most_common(25) yields (city, count) pairs sorted by descending count.
for item, count in cities_dict.most_common(25):
    print(item, ':', count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment