Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Sample input: a paper abstract used to demonstrate the keyword search below.
text = """
We develop a methodology for automatically analyzing text to aid in discriminating firms that encounter catastrophic
financial events. The dictionaries we create from Management Discussion and Analysis Sections (MD&A) of 10-Ks
discriminate fraudulent from non-fraudulent firms 75% of the time and bankrupt from nonbankrupt firms 80% of the
time. Our results compare favorably with quantitative prediction methods. We further test for complementarities by
merging quantitative data with text data. We achieve our best prediction results for both bankruptcy (83.87%) and
fraud (81.97%) with the combined data, showing that that the text of the MD&A complements the quantitative financial
information.
"""

# Keywords whose positions in the sample text are looked up (all lower-case).
key_words = ["quantitative", "results", "automatically"]
# This solution does not aim for general efficiency; it is written for demonstration.
def solution1(text, keywords):
    """Map each keyword found in *text* to the positions where it occurs.

    The text is split on any run of whitespace (spaces, tabs, newlines).
    Each token is then stripped of surrounding dots, parentheses and
    percent signs and lower-cased before it is compared with the keywords.
    Positions are zero-based indices into the resulting token sequence.

    Args:
        text: The text to scan.
        keywords: Iterable of keyword strings. They are compared verbatim
            against the lower-cased tokens, so they should be lower-case.

    Returns:
        A plain dict mapping every keyword that occurs at least once to the
        list of its token positions, in ascending order. Keywords that never
        occur are absent from the result.
    """
    # Build the lookup set once so each membership test is O(1).
    wanted = set(keywords)
    kw_map = {}
    # split() without arguments breaks on newlines too, so words at line
    # boundaries are separate tokens and no empty tokens are produced.
    for word_no, raw_word in enumerate(text.split()):
        # Remove surrounding punctuation and normalise case.
        word = raw_word.strip(' .)(%\n').lower()
        if word in wanted:
            kw_map.setdefault(word, []).append(word_no)
    return kw_map
from collections import defaultdict
def solution2(text, keywords):
    """Map each keyword found in *text* to the positions where it occurs.

    Same contract as a plain-dict keyword index, but built with
    ``defaultdict(list)``. The text is split on any run of whitespace
    (spaces, tabs, newlines). Each token is stripped of surrounding dots,
    parentheses and percent signs and lower-cased before comparison.
    Positions are zero-based indices into the resulting token sequence.

    Args:
        text: The text to scan.
        keywords: Iterable of keyword strings. They are compared verbatim
            against the lower-cased tokens, so they should be lower-case.

    Returns:
        A ``defaultdict(list)`` mapping every keyword that occurs at least
        once to the list of its token positions, in ascending order.
    """
    # Build the lookup set once so each membership test is O(1).
    wanted = set(keywords)
    kw_map = defaultdict(list)
    # split() without arguments breaks on newlines too, so words at line
    # boundaries are separate tokens and no empty tokens are produced.
    for word_no, raw_word in enumerate(text.split()):
        # Remove surrounding punctuation and normalise case.
        word = raw_word.strip(' .)(%\n').lower()
        if word in wanted:
            kw_map[word].append(word_no)
    return kw_map
# Demo run: print the keyword -> word-position mapping that solution2 builds for the sample text.
print(solution2(text, key_words))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment