@e3krisztian
Last active August 29, 2015 14:17

amc6
# coding: utf-8
# In[1]:
import os
print os.getcwd()
os.listdir('.')
# In[2]:
with open('data/private-cities.txt') as f:
    lines = f.readlines()
# In[3]:
# headlines 1
[line for line in lines if len(line) < 80]
# In[4]:
# headlines 2
[line for line in lines if '.' not in line]
# In[5]:
# reading the file line by line, a different way
with open('data/private-cities.txt') as f:
    sections = [line for line in f if len(line) < 80]
# In[6]:
sections
# In[7]:
# number of characters
with open('data/private-cities.txt') as f:
    print len(f.read())
# In[8]:
# number of lines
with open('data/private-cities.txt') as f:
    print len(f.readlines())
# In[9]:
# number of words
with open('data/private-cities.txt') as f:
    words = f.read().split()
print len(words)
# In[10]:
# length of the longest word
max_length = max(len(w) for w in words)
print max_length
# In[11]:
# the longest word(s)
[word for word in words if len(word) == max_length]
# In[12]:
# is the previous solution good?
long_words4 = [word for word in words if len(word) == max_length - 4]
long_words4
# In[13]:
# repetitions!
set(long_words4)
# In[14]:
# most frequent words
word_count = {}
for word in words:
    if word in word_count:
        previous_count = word_count[word]
    else:
        previous_count = 0
    word_count[word] = previous_count + 1

count_to_words = {}
for word, count in word_count.items():
    if count in count_to_words:
        word_list = count_to_words[count]
    else:
        word_list = []
    count_to_words[count] = word_list
    word_list.append(word)

highest_count = max(count_to_words)
print highest_count, count_to_words[highest_count]

high_counts = sorted(count_to_words, reverse=True)[:20]
print high_counts

for count in high_counts:
    print count, count_to_words[count]
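
# The same counting can be done more compactly with collections.Counter from the
# standard library - an alternative sketch, not used elsewhere in this notebook:
from collections import Counter

# list of (word, count) pairs, most frequent first
print Counter(words).most_common(20)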
# In[15]:
with open('data/private-cities.txt') as f:
    text = f.read()
# In[16]:
'!' in text
# In[17]:
'?' in text
# In[18]:
text.splitlines()[:5]
# In[19]:
print lines[5]
# In[20]:
# sentences
def sentences(lines):
    separations = (
        ('. ', '.'),
        ('? ', '?'),
    )
    for separator, tail in separations:
        sentences = []
        for line in lines:
            sentences += split_by_separator(line, separator, tail)
        lines = sentences
    return lines


def split_by_separator(line, separator, tail):
    fragments = line.split(separator)
    sentences = [
        s + tail
        for s in fragments[:-1]
    ] + fragments[-1:]
    return sentences

sentences(lines)[:20]
## Homework
# - Reformat the text so that there are two empty lines before each section, one after the section title, and one between paragraphs.
# - Report the number of sentences per section.
# - Write each section into a file named *{two-digit-section-number}-{section-name}.txt*, e.g. *01-Private-Cities-101.txt* (a sketch follows below).
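
# A possible starting point for the last homework task - only a sketch; the helper
# name write_sections and the section_titles parameter are made up here, and
# section_titles is assumed to hold the short section-title lines found earlier:
def write_sections(lines, section_titles):
    number = 0
    f = None
    for line in lines:
        if line in section_titles:
            # a new section starts: close the previous file, open the next one
            if f is not None:
                f.close()
            number += 1
            name = '-'.join(line.split())
            f = open('%02d-%s.txt' % (number, name), 'w')
        if f is not None:
            f.write(line)
    if f is not None:
        f.close()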
tools

ipython
    - live environment
    - completion on names
    - help on names

ipython notebook
    - ipython in browser
    - later in course

http://pythontutor.com/visualize.html#mode=edit
dict (map, mapping)
    key -> value
    - keys are immutable - tuples vs lists
    - value - anything
    - create
        - empty: {}
        - {'a': 1, 1: 'a'}
        - dict(a=2, b=4)
        - dict(
              (
                  (1, 2),
                  (3, 4),
                  (5, 6)
              )
          )
    - value access - normal indexing:
        d[key]
    - setting value:
        d[key] = value
    - deleting value:
        del d[key]
    - key existence check:
        key in d
    - get with default value:
        d.get(key, default_if_key_unknown)
    - all keys:
        d.keys()
    - iterate over keys:
        for key in d:
            d[key]
        # keys are NOT ordered:
        for key in dict(a=1, b=2, c=3, d=4): print key
    - can be thought of as a finite function
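
    example - counting letters with get and a default value (a small made-up snippet):

        letter_count = {}
        for letter in 'abracadabra':
            letter_count[letter] = letter_count.get(letter, 0) + 1
        for letter in letter_count:
            print letter, letter_count[letter]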
set
    - elements must be immutable!
    .add
    .union
    .difference
    .intersection
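
    example - the operations above on two small made-up sets:

        a = set([1, 2, 3])
        b = set([3, 4])
        a.add(5)
        print a.union(b)         # set([1, 2, 3, 4, 5])
        print a.difference(b)    # set([1, 2, 5])
        print a.intersection(b)  # set([3])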
set vs dict
sorting:
    - in place:
        list.sort()
    - new list:
        sorted(iterable)
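
    example - sorted returns a new list, list.sort sorts in place (made-up list):

        numbers = [3, 1, 2]
        print sorted(numbers)   # [1, 2, 3] - numbers is unchanged
        numbers.sort()          # sorts in place, returns None
        print numbers           # [1, 2, 3]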
File IO
    reading:
        f.read()
        f.readlines()
        for line in f:
            ...
    writing:
        f.write(what)
        f.flush()

Predefined files
    sys.stdin
    sys.stdout
    sys.stderr
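
    example - print goes to sys.stdout; error messages are usually written to sys.stderr:

        import sys
        sys.stdout.write('normal output\n')
        sys.stderr.write('error message\n')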
FileSystem
    os.listdir(dir) -> filenames
    open(filename, mode) -> file
    file.read() -> text
    file.readlines() -> [lines]
    file.write(what)
    file.close()

    with open() as f:
        f.read()
    with open() as f:
        f.write()

    os.remove()
    shutil.rmtree()

??? where to get filenames ???
    - scripts: embedded in the source
    - tools:
        - from the command line:
            sys.argv
        - known configuration file name
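
    example - a tiny script (the name printfile.py is made up) that takes the
    file name as its first command line argument:

        import sys

        filename = sys.argv[1]
        with open(filename) as f:
            print f.read()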
from random import shuffle
DOUBLE = (u'cs', u'dz', u'gy', u'ly', u'ny', u'sz', u'ty', u'zs')
TRIPLE = (u'dzs',)
def typoglicemia(text):
    words = split_to_words(text)
    typoglicemia_words = [word_typoglicemia(word) for word in words]
    return u' '.join(typoglicemia_words)


def word_typoglicemia(word):
    characters = convert_text_to_list(word)
    typoglicemia_characters = list_typoglicemia(characters)
    return u''.join(typoglicemia_characters)


def list_typoglicemia(characters):
    # shorter lists need not be reshuffled
    if len(characters) >= 4:
        middle = characters[1:-1]
        # 'shuffle' shuffles in place, does not return a value!
        shuffle(middle)
        return [characters[0]] + middle + [characters[-1]]
    else:
        return characters


def convert_text_to_list(text):
    # never forget to stop a recursion
    if text == u'':
        return []
    # important: do triple before double before single
    # so that 'dzs' does not become 'dz' + 's', or 'dz' does not become 'd' + 'z'
    if text[0:3].lower() in TRIPLE:
        return [text[0:3]] + convert_text_to_list(text[3:])
    if text[0:2].lower() in DOUBLE:
        return [text[0:2]] + convert_text_to_list(text[2:])
    return [text[0]] + convert_text_to_list(text[1:])


def split_to_words(text):
    return text.split()
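
# A quick way to try it - the sample sentence is made up (Hungarian, with accents
# omitted to keep it ASCII), so it contains 'gy' and 'sz' digraphs:
print typoglicemia(u'gyorsan olvashato marad ez a szoveg')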