icu0755/brief.md

## brief.md

      
    Raw
  

              brief.md
            
          
    The solution is good enough for the provided data.txt.
In production, input files can be far larger,
so I would implement a generator that reads the input file in chunks and
yields tokens that can be accumulated in a hash table instead of keeping the whole file in memory.

  
## result.txt
computer: 17
and: 16
graphics: 12
the: 12
of: 12
a: 8
in: 6
to: 6
has: 5
is: 5
data: 4
been: 4
on: 4
have: 4
used: 4
for: 4
d: 4
are: 3
computers: 3
with: 3
dimensional: 3
visualization: 3
representation: 2
image: 2
by: 2
specialized: 2
graphic: 2
easier: 2
many: 2
types: 2
media: 2
term: 2
that: 2
or: 2
several: 2
different: 2
sub: 2
field: 2
science: 2
which: 2
studies: 2
methods: 2
digitally: 2
synthesizing: 2
manipulating: 2
visual: 2
content: 2
imagery: 2
example: 2
reports: 2
medical: 2
can: 2
other: 2
developed: 2
three: 2
as: 2
more: 2
created: 1
using: 1
specifically: 1
help: 1
from: 1
hardware: 1
software: 1
interaction: 1
understanding: 1
interpretation: 1
made: 1
because: 1
development: 1
had: 1
significant: 1
impact: 1
revolutionized: 1
animation: 1
movies: 1
video: 1
game: 1
industry: 1
overview: 1
broad: 1
sense: 1
describe: 1
almost: 1
everything: 1
not: 1
text: 1
sound: 1
typically: 1
refers: 1
things: 1
manipulation: 1
various: 1
technologies: 1
create: 1
manipulate: 1
images: 1
see: 1
study: 1
widespread: 1
today: 1
found: 1
television: 1
newspapers: 1
weather: 1
all: 1
kinds: 1
investigation: 1
surgical: 1
procedures: 1
well: 1
constructed: 1
graph: 1
present: 1
complex: 1
statistics: 1
form: 1
understand: 1
interpret: 1
such: 1
graphs: 1
illustrate: 1
papers: 1
thesis: 1
presentation: 1
material: 1
powerful: 1
tools: 1
visualize: 1
generated: 1
be: 1
categorized: 1
into: 1
two: 1
animated: 1
technology: 1
improved: 1
become: 1
common: 1
but: 1
still: 1
widely: 1
emerged: 1
over: 1
past: 1
decade: 1
fields: 1
like: 1
information: 1
scientific: 1
concerned: 1
phenomena: 1
architectural: 1
meteorological: 1
biological: 1
etc: 1
where: 1
emphasis: 1
realistic: 1
renderings: 1
volumes: 1
surfaces: 1
illumination: 1
sources: 1
so: 1
forth: 1
perhaps: 1
dynamic: 1
time: 1
component: 1

## task.py
import itertools
import operator
import re
from collections import Counter


def task1(fname):
    """Print each word found in the file *fname* with its occurrence count.

    The text is lower-cased and split on every run of characters outside
    ``a-z``, so digits, punctuation and whitespace all act as separators.
    One ``word: count`` line is printed per distinct word, most frequent
    first; words with equal counts keep the order in which they first
    appear in the file.

    The file is consumed line by line, so only the table of counts (not the
    file contents) is held in memory.

    Args:
        fname: Path of the text file to analyse.

    Returns:
        None. The result is written to standard output.
    """
    sep = re.compile('[^a-z]+')
    counts = Counter()
    with open(fname, 'r') as f:
        # Iterating the file object yields one line at a time instead of
        # materialising every line up front.
        for line in f:
            # Splitting produces '' wherever a line starts or ends with a
            # separator (e.g. the trailing newline); those are not words.
            counts.update(word for word in sep.split(line.lower()) if word)
    # most_common() sorts by count, descending, and keeps first-seen order
    # among equal counts.
    for word, occurrences in counts.most_common():
        print('{}: {}'.format(word, occurrences))


if __name__ == '__main__':
    # Run the word count only when this file is executed as a script, so
    # importing the module does not try to read data.txt.
    task1('data.txt')
	computer: 17
	and: 16
	graphics: 12
	the: 12
	of: 12
	a: 8
	in: 6
	to: 6
	has: 5
	is: 5
	data: 4
	been: 4
	on: 4
	have: 4
	used: 4
	for: 4
	d: 4
	are: 3
	computers: 3
	with: 3
	dimensional: 3
	visualization: 3
	representation: 2
	image: 2
	by: 2
	specialized: 2
	graphic: 2
	easier: 2
	many: 2
	types: 2
	media: 2
	term: 2
	that: 2
	or: 2
	several: 2
	different: 2
	sub: 2
	field: 2
	science: 2
	which: 2
	studies: 2
	methods: 2
	digitally: 2
	synthesizing: 2
	manipulating: 2
	visual: 2
	content: 2
	imagery: 2
	example: 2
	reports: 2
	medical: 2
	can: 2
	other: 2
	developed: 2
	three: 2
	as: 2
	more: 2
	created: 1
	using: 1
	specifically: 1
	help: 1
	from: 1
	hardware: 1
	software: 1
	interaction: 1
	understanding: 1
	interpretation: 1
	made: 1
	because: 1
	development: 1
	had: 1
	significant: 1
	impact: 1
	revolutionized: 1
	animation: 1
	movies: 1
	video: 1
	game: 1
	industry: 1
	overview: 1
	broad: 1
	sense: 1
	describe: 1
	almost: 1
	everything: 1
	not: 1
	text: 1
	sound: 1
	typically: 1
	refers: 1
	things: 1
	manipulation: 1
	various: 1
	technologies: 1
	create: 1
	manipulate: 1
	images: 1
	see: 1
	study: 1
	widespread: 1
	today: 1
	found: 1
	television: 1
	newspapers: 1
	weather: 1
	all: 1
	kinds: 1
	investigation: 1
	surgical: 1
	procedures: 1
	well: 1
	constructed: 1
	graph: 1
	present: 1
	complex: 1
	statistics: 1
	form: 1
	understand: 1
	interpret: 1
	such: 1
	graphs: 1
	illustrate: 1
	papers: 1
	thesis: 1
	presentation: 1
	material: 1
	powerful: 1
	tools: 1
	visualize: 1
	generated: 1
	be: 1
	categorized: 1
	into: 1
	two: 1
	animated: 1
	technology: 1
	improved: 1
	become: 1
	common: 1
	but: 1
	still: 1
	widely: 1
	emerged: 1
	over: 1
	past: 1
	decade: 1
	fields: 1
	like: 1
	information: 1
	scientific: 1
	concerned: 1
	phenomena: 1
	architectural: 1
	meteorological: 1
	biological: 1
	etc: 1
	where: 1
	emphasis: 1
	realistic: 1
	renderings: 1
	volumes: 1
	surfaces: 1
	illumination: 1
	sources: 1
	so: 1
	forth: 1
	perhaps: 1
	dynamic: 1
	time: 1
	component: 1
	import itertools
	import operator
	import re
	from collections import Counter


	def task1(fname):
	    """Print each word found in the file *fname* with its occurrence count.
	
	    The text is lower-cased and split on every run of characters outside
	    ``a-z``, so digits, punctuation and whitespace all act as separators.
	    One ``word: count`` line is printed per distinct word, most frequent
	    first; words with equal counts keep the order in which they first
	    appear in the file.
	
	    The file is consumed line by line, so only the table of counts (not the
	    file contents) is held in memory.
	
	    Args:
	        fname: Path of the text file to analyse.
	
	    Returns:
	        None. The result is written to standard output.
	    """
	    sep = re.compile('[^a-z]+')
	    counts = Counter()
	    with open(fname, 'r') as f:
	        # Iterating the file object yields one line at a time instead of
	        # materialising every line up front.
	        for line in f:
	            # Splitting produces '' wherever a line starts or ends with a
	            # separator (e.g. the trailing newline); those are not words.
	            counts.update(word for word in sep.split(line.lower()) if word)
	    # most_common() sorts by count, descending, and keeps first-seen order
	    # among equal counts.
	    for word, occurrences in counts.most_common():
	        print('{}: {}'.format(word, occurrences))


	if __name__ == '__main__':
	    # Run the word count only when this file is executed as a script, so
	    # importing the module does not try to read data.txt.
	    task1('data.txt')