# tirinox/example_defaultdict.py

## example_defaultdict.py
# Sample input for the demo: a multi-line abstract stored in a triple-quoted
# string, so it begins and ends with a newline and has line breaks inside.
text = """
We develop a methodology for automatically analyzing text to aid in discriminating firms that encounter catastrophic
financial events. The dictionaries we create from Management Discussion and Analysis Sections (MD&A) of 10-Ks
discriminate fraudulent from non-fraudulent firms 75% of the time and bankrupt from nonbankrupt firms 80% of the
time. Our results compare favorably with quantitative prediction methods. We further test for complementarities by
merging quantitative data with text data. We achieve our best prediction results for both bankruptcy (83.87%) and
fraud (81.97%) with the combined data, showing that that the text of the MD&A complements the quantitative financial
information.
"""

# Keywords whose word positions in ``text`` are looked up. They are written in
# lower case because the words of the text are lower-cased before comparison.
key_words = [
    "quantitative",
    "results",
    "automatically"
]

# This solution makes no claim to general efficiency; it is written for demonstration only.

def solution1(text, keywords):
    """Map each keyword found in ``text`` to the positions where it occurs.

    The text is split on runs of whitespace. Every token is stripped of
    spaces, periods, parentheses, percent signs and newlines at both ends and
    lower-cased before it is compared with the keywords. Positions are
    zero-based token indices.

    This variant builds the result with a plain ``dict`` and an explicit
    "is the key already there?" check.

    Args:
        text: The string to search.
        keywords: Iterable of hashable keywords (normally lower-case strings,
            since tokens are lower-cased before comparison).

    Returns:
        A ``dict`` mapping each keyword that occurs at least once to the list
        of its token positions, in ascending order. Keywords that never occur
        are absent.
    """
    # Split text into words and strip surrounding punctuation.
    # split() without an argument splits on any whitespace, so words separated
    # only by a line break are not glued together into a single token.
    all_words = (word.strip(' .)(%\n').lower() for word in text.split())

    # Use the caller-supplied keywords (not a module global); a set gives
    # O(1) membership tests inside the loop.
    wanted = set(keywords)

    kw_map = {}

    for word_no, word in enumerate(all_words):
        if word in wanted:
            if word in kw_map:
                kw_map[word].append(word_no)
            else:
                kw_map[word] = [word_no]

    return kw_map

from collections import defaultdict

def solution2(text, keywords):
    """Map each keyword found in ``text`` to the positions where it occurs.

    The text is split on runs of whitespace. Every token is stripped of
    spaces, periods, parentheses, percent signs and newlines at both ends and
    lower-cased before it is compared with the keywords. Positions are
    zero-based token indices.

    This variant relies on ``defaultdict(list)`` to create the per-keyword
    list on first access, so no explicit key check is needed.

    Args:
        text: The string to search.
        keywords: Iterable of hashable keywords (normally lower-case strings,
            since tokens are lower-cased before comparison).

    Returns:
        A ``defaultdict(list)`` mapping each keyword that occurs at least once
        to the list of its token positions, in ascending order.
    """
    # Split text into words and strip surrounding punctuation.
    # split() without an argument splits on any whitespace, so words separated
    # only by a line break are not glued together into a single token.
    all_words = (word.strip(' .)(%\n').lower() for word in text.split())

    # Use the caller-supplied keywords (not a module global); a set gives
    # O(1) membership tests inside the loop.
    wanted = set(keywords)

    kw_map = defaultdict(list)

    for word_no, word in enumerate(all_words):
        if word in wanted:
            kw_map[word].append(word_no)

    return kw_map


# Demo run: apply the defaultdict-based variant to the sample text and print
# the resulting keyword -> positions mapping.
print(solution2(text, key_words))
	# NOTE(review): this assignment repeats the sample text from the top of the
	# file, but every line (including those inside the string literal) carries
	# a leading tab -- looks like a whitespace-mangled paste; confirm intent.
	text = """
	We develop a methodology for automatically analyzing text to aid in discriminating firms that encounter catastrophic
	financial events. The dictionaries we create from Management Discussion and Analysis Sections (MD&A) of 10-Ks
	discriminate fraudulent from non-fraudulent firms 75% of the time and bankrupt from nonbankrupt firms 80% of the
	time. Our results compare favorably with quantitative prediction methods. We further test for complementarities by
	merging quantitative data with text data. We achieve our best prediction results for both bankruptcy (83.87%) and
	fraud (81.97%) with the combined data, showing that that the text of the MD&A complements the quantitative financial
	information.
	"""

	# Keywords whose word positions in ``text`` are looked up (lower case).
	key_words = [
	"quantitative",
	"results",
	"automatically"
	]

	# This solution makes no claim to general efficiency; it is written for demonstration only.

	def solution1(text, keywords):
		"""Map each keyword found in ``text`` to the positions where it occurs.

		The text is split on runs of whitespace. Every token is stripped of
		spaces, periods, parentheses, percent signs and newlines at both ends
		and lower-cased before it is compared with the keywords. Positions are
		zero-based token indices.

		This variant builds the result with a plain ``dict`` and an explicit
		"is the key already there?" check.

		Args:
			text: The string to search.
			keywords: Iterable of hashable keywords (normally lower-case
				strings, since tokens are lower-cased before comparison).

		Returns:
			A ``dict`` mapping each keyword that occurs at least once to the
			list of its token positions, in ascending order. Keywords that
			never occur are absent.
		"""
		# Split text into words and strip surrounding punctuation.
		# split() without an argument splits on any whitespace, so words
		# separated only by a line break (or a tab) are not glued together.
		all_words = (word.strip(' .)(%\n').lower() for word in text.split())

		# Use the caller-supplied keywords (not a module global); a set gives
		# O(1) membership tests inside the loop.
		wanted = set(keywords)

		kw_map = {}

		for word_no, word in enumerate(all_words):
			if word in wanted:
				if word in kw_map:
					kw_map[word].append(word_no)
				else:
					kw_map[word] = [word_no]

		return kw_map

	from collections import defaultdict

	def solution2(text, keywords):
		"""Map each keyword found in ``text`` to the positions where it occurs.

		The text is split on runs of whitespace. Every token is stripped of
		spaces, periods, parentheses, percent signs and newlines at both ends
		and lower-cased before it is compared with the keywords. Positions are
		zero-based token indices.

		This variant relies on ``defaultdict(list)`` to create the per-keyword
		list on first access, so no explicit key check is needed.

		Args:
			text: The string to search.
			keywords: Iterable of hashable keywords (normally lower-case
				strings, since tokens are lower-cased before comparison).

		Returns:
			A ``defaultdict(list)`` mapping each keyword that occurs at least
			once to the list of its token positions, in ascending order.
		"""
		# Split text into words and strip surrounding punctuation.
		# split() without an argument splits on any whitespace, so words
		# separated only by a line break (or a tab) are not glued together.
		all_words = (word.strip(' .)(%\n').lower() for word in text.split())

		# Use the caller-supplied keywords (not a module global); a set gives
		# O(1) membership tests inside the loop.
		wanted = set(keywords)

		kw_map = defaultdict(list)

		for word_no, word in enumerate(all_words):
			if word in wanted:
				kw_map[word].append(word_no)

		return kw_map


	# Demo run: apply the defaultdict-based variant to the sample text and
	# print the resulting keyword -> positions mapping.
	print(solution2(text, key_words))