Last active
August 29, 2015 14:17
-
-
Save e3krisztian/458e486d5d937246906e to your computer and use it in GitHub Desktop.
amc6
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# In[1]: | |
import os | |
# Show where the notebook is running and what that directory contains.
# A single-argument print(...) gives the same output on Python 2 and 3.
print(os.getcwd())
os.listdir('.')

# In[2]:

# Read the whole text as a list of lines (each keeps its line ending).
with open('data/private-cities.txt') as f:
    lines = f.readlines()

# In[3]:

# headlines 1: lines shorter than 80 characters
[line for line in lines if len(line) < 80]

# In[4]:

# headlines 2: lines that contain no full stop
[line for line in lines if '.' not in line]

# In[5]:

# another way of reading line by line: iterate over the file object itself
with open('data/private-cities.txt') as f:
    sections = [line for line in f if len(line) < 80]

# In[6]:

sections
# In[7]:

# number of characters
with open('data/private-cities.txt') as f:
    print(len(f.read()))

# In[8]:

# number of lines
with open('data/private-cities.txt') as f:
    print(len(f.readlines()))

# In[9]:

# number of words (whitespace-separated tokens)
with open('data/private-cities.txt') as f:
    words = f.read().split()
print(len(words))

# In[10]:

# length of the longest word
max_length = max(len(w) for w in words)
print(max_length)

# In[11]:

# the longest word(s)
[word for word in words if len(word) == max_length]

# In[12]:

# is the previous solution good? look at words 4 characters shorter
long_words4 = [word for word in words if len(word) == max_length - 4]
long_words4

# In[13]:

# duplicates! a set keeps each word only once
set(long_words4)
# most frequent words

# word -> number of occurrences in `words`
word_count = {}
for word in words:
    # .get() supplies 0 for a word seen for the first time
    word_count[word] = word_count.get(word, 0) + 1

# occurrence count -> list of all words occurring exactly that many times
count_to_words = {}
for word, count in word_count.items():
    # .setdefault() creates the list the first time a count is met
    count_to_words.setdefault(count, []).append(word)

# The keys are the counts, so max() over the dict is the top frequency.
highest_count = max(count_to_words)
# One formatted string prints "count [words]" identically on Python 2 and 3.
print('%s %s' % (highest_count, count_to_words[highest_count]))

# The 20 highest distinct counts, largest first.
high_counts = sorted(count_to_words, reverse=True)[:20]
print(high_counts)
for count in high_counts:
    print('%s %s' % (count, count_to_words[count]))
# In[15]:

# Keep the whole file content as one string.
with open('data/private-cities.txt') as f:
    text = f.read()

# In[16]:

# Does the text contain exclamation marks?
'!' in text

# In[17]:

# Does the text contain question marks?
'?' in text

# In[18]:

# First five lines, without their line endings.
text.splitlines()[:5]

# In[19]:

# The sixth line as read by readlines() (it still has its line ending).
print(lines[5])
# In[20]: | |
# sentences | |
def sentences(lines):
    """Break lines of text into sentence-like pieces.

    The text is split first at '. ' and then, piece by piece, at '? '.
    The punctuation mark stays at the end of the piece it closes; the
    space that followed it is dropped.

    lines -- iterable of strings
    Returns a list of strings.
    """
    # (separator to split at, punctuation to put back on the split piece)
    rules = (
        ('. ', '.'),
        ('? ', '?'),
    )
    for separator, tail in rules:
        # Each pass re-splits the output of the previous pass.
        lines = [
            piece
            for line in lines
            for piece in split_by_separator(line, separator, tail)
        ]
    return lines
def split_by_separator(line, separator, tail):
    """Split line at every separator, re-attaching tail to the cut pieces.

    Every piece that was followed by a separator gets tail appended;
    the final piece (the text after the last separator, possibly empty)
    is returned unchanged.

    Returns a list with at least one string.
    """
    pieces = line.split(separator)
    last_index = len(pieces) - 1
    return [
        piece if index == last_index else piece + tail
        for index, piece in enumerate(pieces)
    ]
# Preview the first 20 sentences produced from the lines read earlier.
sentences(lines)[:20]
## Homework | |
# - Reformat the text, so that there are two spaces before sections, one after section start, and one between paragraphs. | |
# - Report the number of sentences per section. | |
# - Write sections into file named *{two-digit-section-number}-{section-name}.txt* e.g. *01-Private-Cities-101.txt* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
tools | |
ipython | |
- live environment | |
- completion on names | |
- help on names | |
ipython notebook | |
- ipython in browser | |
- later in course | |
http://pythontutor.com/visualize.html#mode=edit | |
dict (map, mapping) | |
key -> value | |
- keys must be immutable (hashable): tuples can be keys, lists cannot
- value - anything | |
- create | |
- empty: {} | |
- {'a': 1, 1: 'a'} | |
- dict(a=2, b=4) | |
- dict( | |
( | |
(1, 2), | |
(3, 4), | |
(5, 6) | |
) | |
) | |
- value access - normal indexing: | |
d[key] | |
- setting value: | |
d[key] = value | |
- deleting value: | |
del d[key] | |
- key existence check: | |
key in d | |
- get with default value: | |
d.get(key, default_if_key_unknown) | |
- all keys: | |
d.keys() | |
- iterate over keys: | |
for key in d: | |
d[key] | |
# keys are NOT ordered: | |
for key in dict(a=1, b=2, c=3, d=4): print key | |
- can be thought of as a finite function | |
set | |
- elements must be immutable (hashable)!
.add | |
.union | |
.difference | |
.intersection | |
set vs dict | |
sorting: | |
- inplace | |
list.sort() | |
- new list | |
sorted(iterable) | |
File IO | |
reading: | |
f.read() | |
f.readlines() | |
for line in f: | |
... | |
writing: | |
f.write(what) | |
f.flush() | |
Predefined files | |
sys.stdin | |
sys.stdout | |
sys.stderr | |
FileSystem | |
os.listdir(dir) -> filenames | |
open(filename, mode) -> file | |
file.read() -> text | |
file.readlines() -> [lines] | |
file.write(what) | |
file.close() | |
file.close() | |
with open() as f: | |
f.read() | |
with open() as f: | |
f.write() | |
os.remove() | |
shutil.rmtree() | |
??? where to get filenames ??? | |
- scripts embed in the source | |
- tools | |
- from command line | |
sys.argv | |
- known configuration file name |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from random import shuffle | |
DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs') | |
TRIPLE = (u'dzs',) | |
def typoglicemia(text):
    """Return text with the inner letters of every word shuffled.

    The text is split into words on whitespace, each word is scrambled
    with word_typoglicemia, and the results are joined with single
    spaces.
    """
    return u' '.join(
        word_typoglicemia(word) for word in split_to_words(text)
    )
def word_typoglicemia(word):
    """Scramble one word, keeping its first and last letters in place.

    The word is first cut into letters by convert_text_to_list, so a
    multi-character letter moves as a single unit.
    """
    letters = convert_text_to_list(word)
    return u''.join(list_typoglicemia(letters))
def list_typoglicemia(characters):
    """Shuffle the inner items of a list, keeping both ends fixed.

    Lists with fewer than 4 items are returned as they are (the same
    object); longer ones yield a new list and the input is untouched.
    """
    # With fewer than 4 items there is nothing meaningful to reorder.
    if len(characters) < 4:
        return characters

    first, last = characters[0], characters[-1]
    inner = characters[1:-1]
    # shuffle() reorders `inner` in place and returns None.
    shuffle(inner)
    return [first] + inner + [last]
def convert_text_to_list(text):
    """Cut text into a list of letters, keeping multi-character letters whole.

    At each position the longest match wins: a three-character letter
    from TRIPLE, then a two-character letter from DOUBLE, otherwise a
    single character. Matching ignores case; the returned pieces keep
    the original case. An empty text gives an empty list.

    The scan is iterative, so long inputs neither hit the recursion
    limit nor pay for re-slicing the remaining text at every letter.
    """
    letters = []
    position = 0
    while position < len(text):
        # important: try triple before double before single
        # so that 'dzs' does not become 'dz' + 's', or 'dz' become 'd' + 'z'
        for width, multigraphs in ((3, TRIPLE), (2, DOUBLE)):
            if text[position:position + width].lower() in multigraphs:
                break
        else:
            # no multi-character letter starts here
            width = 1
        letters.append(text[position:position + width])
        position += width
    return letters
def split_to_words(text):
    """Return the words of text, split on runs of whitespace."""
    words = text.split()
    return words
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment