e3krisztian/class6.ipynb

## class6.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              class6.ipynb
            
          
        Loading

      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
      
    
## class6.py

# coding: utf-8

# In[1]:

# Show the current working directory and list its entries.
import os
print os.getcwd()
os.listdir('.')


# In[2]:

# Read the whole file into a list of lines; readlines() keeps the trailing
# newline on each line.
with open('data/private-cities.txt') as f:
    lines = f.readlines()


# In[3]:

# headlines 1
# First heuristic: keep the lines shorter than 80 characters
# (the length includes the trailing newline).
[line for line in lines if len(line) < 80]


# In[4]:

# headlines 2
# Second heuristic: keep the lines that contain no full stop.
[line for line in lines if '.' not in line]


# In[5]:

# reading line by line, another way
# Iterating over the file object yields the lines one at a time, so the
# intermediate readlines() list is not needed.
with open('data/private-cities.txt') as f:
    sections = [line for line in f if len(line) < 80]


# In[6]:

# Display the short lines collected in the previous cell.
sections


# In[7]:

# number of characters
# Length of the whole file content as returned by a single read().
with open('data/private-cities.txt') as f:
    print len(f.read())


# In[8]:

# number of lines
with open('data/private-cities.txt') as f:
    print len(f.readlines())


# In[9]:

# number of words
# split() without arguments cuts at runs of whitespace, so a "word" here is
# any whitespace-delimited token, punctuation included.
with open('data/private-cities.txt') as f:
    words = f.read().split()

print len(words)


# In[10]:

# length of the longest word
max_length = max(len(w) for w in words)
print max_length


# In[11]:

# longest word / words
# Every token whose length equals the maximum found above.
[word for word in words if len(word) == max_length]


# In[12]:

# is the previous solution good?
# Same filter, but for tokens exactly 4 characters shorter than the longest.
long_words4 = [word for word in words if len(word) == max_length - 4]
long_words4


# In[13]:

# repetitions!
# Converting to a set collapses any token that occurs more than once.
set(long_words4)


# In[14]:

# leggyakoribb szavak
word_count = {}
for word in words:
    if word in word_count:
        previous_count = word_count[word]
    else:
        previous_count = 0
    word_count[word] = previous_count + 1

count_to_words = {}

for word, count in word_count.items():
    if count in count_to_words:
        word_list = count_to_words[count]
    else:
        word_list = []
        count_to_words[count] = word_list
    word_list.append(word)

highest_count = max(count_to_words)
print highest_count, count_to_words[highest_count]
high_counts = sorted(count_to_words, reverse=True)[:20]
print high_counts
for count in high_counts:
    print count, count_to_words[count]


# In[15]:

# Read the whole file into one string.
with open('data/private-cities.txt') as f:
    text = f.read()


# In[16]:

# Does the text contain any exclamation mark?
'!' in text


# In[17]:

# Does the text contain any question mark?
'?' in text


# In[18]:

# First five lines of the text; splitlines() drops the line endings.
text.splitlines()[:5]


# In[19]:

# Print the sixth line (index 5) of the list read in cell 2.
print lines[5]


# In[20]:

# sentences
def sentences(lines, separations=(('. ', '.'), ('? ', '?'))):
    """Break text lines into sentence-like pieces.

    Every ``(separator, tail)`` pair in *separations* is applied in turn to
    all pieces produced so far: a piece is cut wherever *separator* occurs
    and *tail* is appended to each cut-off part, putting back the
    punctuation that the split removed.

    The default separators end with a space, so a full stop or question
    mark directly followed by a line ending (or by the end of the string)
    does not cause a cut.

    Parameters:
        lines: iterable of strings to split.
        separations: iterable of ``(separator, tail)`` string pairs.
            Defaults to splitting after '. ' and '? '.

    Returns:
        A list of pieces in their original order. If *separations* is
        empty, *lines* itself is returned unchanged.
    """
    pieces = lines
    for separator, tail in separations:
        # Re-split everything found so far with the next separator.
        split_pieces = []
        for piece in pieces:
            split_pieces.extend(split_by_separator(piece, separator, tail))
        pieces = split_pieces
    return pieces

def split_by_separator(line, separator, tail):
    """Split *line* at *separator*, ending each cut-off part with *tail*.

    Every fragment that was followed by *separator* gets *tail* appended;
    the last fragment (the text after the final separator) is returned
    unchanged.

    Returns the list of pieces in their original order.
    """
    pieces = line.split(separator)
    # Everything but the last piece was followed by a separator.
    cut_off, remainder = pieces[:-1], pieces[-1:]
    result = []
    for piece in cut_off:
        result.append(piece + tail)
    result.extend(remainder)
    return result

# Preview the first 20 pieces produced from the lines read in cell 2.
sentences(lines)[:20]


## Homework

# - Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.
# - Report the number of sentences per section.
# - Write each section into a file named *{two-digit-section-number}-{section-name}.txt*, e.g. *01-Private-Cities-101.txt*

## plan.txt
tools
	ipython
		- live environment
		- completion on names
		- help on names

	ipython notebook
		- ipython in browser
		- later in course

	http://pythontutor.com/visualize.html#mode=edit


dict (map, mapping)
	key -> value

	- keys must be immutable (hashable) - tuples can be keys, lists cannot
	- value - anything

	- create
		- empty: {}
		- {'a': 1, 1: 'a'}
		- dict(a=2, b=4)
		- dict(
				(
				(1, 2),
				(3, 4),
				(5, 6)
				)
			)
	- value access - normal indexing:
		d[key]
	- setting value:
		d[key] = value
	- deleting value:
		del d[key]
	- key existence check:
		key in d
	- get with default value:
		d.get(key, default_if_key_unknown)
	- all keys:
		d.keys()
	- iterate over keys:
		for key in d:
			d[key]
		# keys are NOT ordered:
		for key in dict(a=1, b=2, c=3, d=4): print key
	-  can be thought of as a finite function


set
	- elements must be immutable (hashable)!

	.add
	.union
	.difference
	.intersection

set vs dict

sorting:
	- inplace
		list.sort()
	- new list
		sorted(iterable)

File IO
	reading:
		f.read()
		f.readlines()
		for line in f:
			...

	writing:
		f.write(what)
		f.flush()


Predefined files
	sys.stdin
	sys.stdout
	sys.stderr


FileSystem
	os.listdir(dir) -> filenames

	open(filename, mode) -> file
		file.read() -> text
		file.readlines() -> [lines]
		file.write(what)
		file.close()

	file.close()

	with open() as f:
		f.read()

	with open() as f:
		f.write()

	os.remove()
		shutil.rmtree()


??? where to get filenames ???
	- scripts embed in the source
	- tools
		- from command line
			sys.argv
		- known configuration file name

## typoglicemia.py
from random import shuffle

DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs')
TRIPLE = (u'dzs',)


def typoglicemia(text):
    """Return *text* with every word scrambled by ``word_typoglicemia``.

    The text is cut into words with ``split_to_words`` and the scrambled
    words are joined with single spaces, so the original whitespace
    (line breaks, repeated spaces) is not reproduced in the result.
    """
    scrambled = []
    for word in split_to_words(text):
        scrambled.append(word_typoglicemia(word))
    return u' '.join(scrambled)


def word_typoglicemia(word):
    """Scramble one word, keeping its first and last letter in place.

    ``convert_text_to_list`` breaks the word into letters (a letter may be
    several characters long), ``list_typoglicemia`` shuffles the letters
    between the first and the last one, and the result is concatenated
    back into a single string.
    """
    return u''.join(list_typoglicemia(convert_text_to_list(word)))


def list_typoglicemia(characters):
    """Shuffle the items of *characters* between its first and last item.

    A list with fewer than four items is returned as is (the very same
    object). Otherwise a new list is returned with the first and last item
    in their original positions and the items in between in random order;
    the input list is not modified.
    """
    if len(characters) < 4:
        # shorter lists need not be reshuffled
        return characters

    inner = characters[1:-1]
    # 'shuffle' reorders in place and does not return a value!
    shuffle(inner)

    result = [characters[0]]
    result.extend(inner)
    result.append(characters[-1])
    return result


def convert_text_to_list(text):
    """Break *text* into a list of letters, left to right.

    A letter is normally one character, but the multi-character letters
    listed in the module constants TRIPLE and DOUBLE (these look like the
    Hungarian trigraph and digraphs - confirm) are kept together as a
    single list item. Matching is case-insensitive; the items keep the
    original casing of *text*.

    The text is scanned with an index instead of recursing once per
    letter, so long inputs cannot exceed the interpreter's recursion limit
    and the remaining text is not re-sliced at every step.

    Returns:
        A list of strings whose concatenation equals *text*; an empty
        list for an empty *text*.
    """
    letters = []
    position = 0
    end = len(text)
    while position < end:
        # important: try triple before double before single
        # so that 'dzs' does not become 'dz' + 's', or 'dz' to 'd' + 'z'
        if text[position:position + 3].lower() in TRIPLE:
            width = 3
        elif text[position:position + 2].lower() in DOUBLE:
            width = 2
        else:
            width = 1
        letters.append(text[position:position + width])
        position += width
    return letters


def split_to_words(text):
    """Cut *text* into words at runs of whitespace.

    Leading and trailing whitespace produce no empty words; an empty or
    all-whitespace text gives an empty list.
    """
    words = text.split()
    return words

	# coding: utf-8

	# In[1]:

	import os
	print os.getcwd()
	os.listdir('.')


	# In[2]:

	with open('data/private-cities.txt') as f:
	lines = f.readlines()


	# In[3]:

	# headlines 1
	[line for line in lines if len(line) < 80]


	# In[4]:

	# headlines 2
	[line for line in lines if '.' not in line]


	# In[5]:

	# beolvasas soronkent mashogy
	with open('data/private-cities.txt') as f:
	sections = [line for line in f if len(line) < 80]


	# In[6]:

	sections


	# In[7]:

	# karakterek szama
	with open('data/private-cities.txt') as f:
	print len(f.read())


	# In[8]:

	# sorok szama
	with open('data/private-cities.txt') as f:
	print len(f.readlines())


	# In[9]:

	# szavak szama
	with open('data/private-cities.txt') as f:
	words = f.read().split()

	print len(words)


	# In[10]:

	# leghosszabb szo hossza
	max_length = max(len(w) for w in words)
	print max_length


	# In[11]:

	# leghosszabb szo/szavak
	[word for word in words if len(word) == max_length]


	# In[12]:

	# jo az elozo megoldas?
	long_words4 = [word for word in words if len(word) == max_length - 4]
	long_words4


	# In[13]:

	# ismetlodesek!
	set(long_words4)


	# In[14]:

	# leggyakoribb szavak
	word_count = {}
	for word in words:
	if word in word_count:
	previous_count = word_count[word]
	else:
	previous_count = 0
	word_count[word] = previous_count + 1

	count_to_words = {}

	for word, count in word_count.items():
	if count in count_to_words:
	word_list = count_to_words[count]
	else:
	word_list = []
	count_to_words[count] = word_list
	word_list.append(word)

	highest_count = max(count_to_words)
	print highest_count, count_to_words[highest_count]
	high_counts = sorted(count_to_words, reverse=True)[:20]
	print high_counts
	for count in high_counts:
	print count, count_to_words[count]


	# In[15]:

	with open('data/private-cities.txt') as f:
	text = f.read()


	# In[16]:

	'!' in text


	# In[17]:

	'?' in text


	# In[18]:

	text.splitlines()[:5]


	# In[19]:

	print lines[5]


	# In[20]:

	# sentences
	def sentences(lines):
	separations=(
	('. ', '.'),
	('? ', '?'),
	)
	for separator, tail in separations:
	sentences = []
	for line in lines:
	sentences += split_by_separator(line, separator, tail)
	lines = sentences
	return lines

	def split_by_separator(line, separator, tail):
	fragments = line.split(separator)
	sentences = [
	s + tail
	for s in fragments[:-1]
	] + fragments[-1:]
	return sentences

	sentences(lines)[:20]


	## Homework

	# - Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs.
	# - Report the number of sentences per section.
	# - Write sections into file named {two-digit-section-number}-{section-name}.txt e.g. 01-Private-Cities-101.txt
	tools
	ipython
	- live environment
	- completion on names
	- help on names

	ipython notebook
	- ipython in browser
	- later in course

	http://pythontutor.com/visualize.html#mode=edit


	dict (map, mapping)
	key -> value

	- keys are immutable - tuples vs lists
	- value - anything

	- create
	- empty: {}
	- {'a': 1, 1: 'a'}
	- dict(a=2, b=4)
	- dict(
	(
	(1, 2),
	(3, 4),
	(5, 6)
	)
	)
	- value access - normal indexing:
	d[key]
	- setting value:
	d[key] = value
	- deleting value:
	del d[key]
	- key existence check:
	key in d
	- get with default value:
	d.get(key, default_if_key_unknown)
	- all keys:
	d.keys()
	- iterate over keys:
	for key in d:
	d[key]
	# keys are NOT ordered:
	for key in dict(a=1, b=2, c=3, d=4): print key
	- can be thought of as a finite function


	set
	- values are immutable!

	.add
	.union
	.difference
	.intersection

	set vs dict

	sorting:
	- inplace
	list.sort()
	- new list
	sorted(iterable)

	File IO
	reading:
	f.read()
	f.readlines()
	for line in f:
	...

	writing:
	f.write(what)
	f.flush()


	Predefined files
	sys.stdin
	sys.stdout
	sys.stderr


	FileSystem
	os.listdir(dir) -> filenames

	open(filename, mode) -> file
	file.read() -> text
	file.readlines() -> [lines]
	file.write(what)
	file.close()

	file.close()

	with open() as f:
	f.read()

	with open() as f:
	f.write()

	os.remove()
	shutil.rmtree()


	??? where to get filenames ???
	- scripts embed in the source
	- tools
	- from command line
	sys.argv
	- known configuration file name
	from random import shuffle

	DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs')
	TRIPLE = (u'dzs',)


	def typoglicemia(text):
	words = split_to_words(text)
	typoglicemia_words = [word_typoglicemia(word) for word in words]
	return u' '.join(typoglicemia_words)


	def word_typoglicemia(word):
	characters = convert_text_to_list(word)
	typoglicemia_characters = list_typoglicemia(characters)
	return u''.join(typoglicemia_characters)


	def list_typoglicemia(characters):
	# shorter lists need not be reshuffled
	if len(characters) >= 4:
	middle = characters[1:-1]
	# 'shuffle' shuffles in place, does not return a value!
	shuffle(middle)
	return [characters[0]]+middle+[characters[-1]]
	else:
	return characters


	def convert_text_to_list(text):
	# never forget to stop a recursion
	if text == u'':
	return []

	# important: do triple before double before single
	# so that 'dzs' does not become 'dz' + 's', or 'dz' to 'd' + 'z'
	if text[0:3].lower() in TRIPLE:
	return [text[0:3]] + convert_text_to_list(text[3:])
	if text[0:2].lower() in DOUBLE:
	return [text[0:2]] + convert_text_to_list(text[2:])
	return [text[0]] + convert_text_to_list(text[1:])


	def split_to_words(text):
	return text.split()