# Python Snippets
# (gist last active February 12, 2017 20:28)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# converts csv to libsvm format
# usage: python (after having modified input and output filename)
import codecs
# Convert a semicolon-separated CSV file into libsvm format.
# Each input row "label;v1;v2;..." becomes "label 1:v1 2:v2 ...".

# change input filename here
filename = "features.csv"
# change output filename here
out = "features.libsvm"

# Read the whole input first so the input handle is closed before writing.
with codecs.open(filename, "r", "utf-8") as fh:
    lines = fh.readlines()

with codecs.open(out, "w", "utf-8") as fh:
    for l in lines:
        l = l.split(";")
        # The first column is emitted as the label.
        lineout = l[0] + " "
        # NOTE(review): the slice drops the last TWO columns; presumably the
        # rows end with a trailing ';' plus one unused column -- confirm
        # against the real input data.
        values = l[1:-2]
        # libsvm feature indices start at 1, not at 0
        lineout += "".join(
            str(i) + ":" + v + " " for i, v in enumerate(values, 1)
        )
        lineout += "\n"
        # The source built `lineout` but never wrote it to the output file.
        fh.write(lineout)
# Download the free audio trainer German / French from Deutsche Welle.
# Uses the Python 2 API urllib.urlretrieve(url, target_path).
import urllib

# NOTE(review): the URL literals below are empty apart from the file-name
# suffix, so the host/path prefix appears to have been lost from this copy.
# It must be restored before the script can work. The two loops are kept
# separate because their URL prefixes presumably differed -- confirm.
for i in range(1, 10):
    urllib.urlretrieve('' +str(i)+'_dwdownload.mp3', '/home/me/filename_' + str(i) + '.mp3')
for i in range(10, 100):
    urllib.urlretrieve('' +str(i)+'_dwdownload.mp3', '/home/me/filename_' + str(i) + '.mp3')
# lesson 100 has its own URL
urllib.urlretrieve('', '/home/me/filename_100.mp3')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import urllib
# performs a Google search with the title and returns the first URLs of the query result
def search_web(title):
    """Search the web for *title* and return the result URLs.

    The title's words are joined with '+' and sent as the ``q`` parameter.
    The JSON answer is expected to look like
    ``{"responseData": {"results": [{"url": ...}, ...]}}``.

    Returns a list of URL strings, or an empty list when the answer carries
    no ``responseData`` (missing key or null value).
    """
    words = "+".join(title.split())
    query = urllib.urlencode({'q': words})
    # the ajax google api only returns the first results but that is enough for our purpose
    # NOTE(review): the endpoint literal is empty in this copy of the file;
    # the base URL has to be restored before this can work.
    response = urllib.urlopen('' + query).read()
    json_response = json.loads(response)
    # Guard clause: the source had lost its `else:` so `return []` always ran.
    # `.get` also covers an answer without a 'responseData' key at all.
    if not json_response or not json_response.get('responseData'):
        return []
    results = json_response['responseData']['results']
    urls = [result['url'] for result in results]
    print('Found ' + str(len(urls)) + ' URLs.')
    return urls
{'Estonian': 'et', 'Bicol': 'bcl', 'Zande': 'zne', 'Uruund': 'rnd', 'Khoekhoegowab': 'naq', u'Newari': 'new', 'Krio': 'kri', 'Nuer': 'nus', 'Cinyanja': 'nya', u'Bulgarian': 'bg', 'Norwegian': 'no', u'Yoruba': 'yo', u'French': 'fr', u'Otomi': 'oto', 'Ateso': 'teo', 'Iloko': 'ilo', 'Wolaita': 'wal', 'Tsonga': 'ts', 'Tzotzil': 'tzo', u'Tamil': 'ta', u'Haitian': 'ht', 'Samoan': 'sm', 'Aukan': 'djk', 'Finnish': 'fi', 'Rutoro': 'ttj', 'Dangme': 'ada', 'Albanian': 'sq', 'Mbunda': 'mck', 'Solomon': 'pis', 'Hiligaynon': 'hil', 'Tagalog': 'tl', u'Serbian': 'sr_latn', u'Efik': 'efi', 'Pangasinan': 'pag', 'Italian': 'it', 'Miskito': 'miq', 'Lhukonzo': 'koo', 'Lamba': 'lam', u'Kongo': 'kg', 'Mazatec': 'mau', u'Tarascan': 'tsz', u'Amharic': 'am', u'Czech': 'cs', u'Papiamento': 'pap', u'Nahuatl': 'ncj', 'Ga': 'gaa', 'Polish': 'pl', 'Tongan': 'to', 'Xhosa': 'xh', 'Swedish': 'sv', u'Marathi': 'mr', 'Luganda': 'lg', u'Slovenian': 'sl', 'Ewe': 'ee', u'Azerbaijani': 'az_cyrl', u'Kikuyu': 'ki', 'Luo': 'luo', 'Tankarana': 'xmv', 'Danish': 'da', 'Indonesian': 'id', 'Frafra': 'gur', 'Zulu': 'zu', 'Lenje': 'leh', 'Cakchiquel': 'cak', u'Georgian': 'ka', 'Mayangna': 'yan', 'Tetum': 'tdt', u'Tigrinya': 'ti', 'Nzema': 'nzi', 'Niuean': 'niu', u'Slovak': 'sk', u'Thai': 'th', 'Afrikaans': 'af', u'Lahu': 'lhu', u'Guarani': 'gug', 'Sidama': 'sid', u'Punjabi': 'pa', 'Kalenjin': 'kln', 'Herero': 'hz', u'Kekchi': 'kek', 'Kisonge': 'sop', u'Latvian': 'lv', 'English': 'en', 'Mambwe-Lungu': 'mgr', 'Lingala': 'ln', u'Faeroese': 'fo', u'Chinese': 'zh_hant', 'Wayuunaiki': 'guc', 'Quichua': 'qus', 'Huave': 'huv', u'Tatar': 'tt', 'Kabyle': 'kab', 'Chin': 'cnh', u'Quiche': 'quc', 'Rapa': 'rap', 'Venda': 've', 'Tojolabal': 'toj', 'Swahili': 'sw', u'Icelandic': 'is', u'Turkish': 'tr', 'Kalanga': 'kck', 'Twi': 'tw', 'Waray-Waray': 'war', u'Kirghiz': 'ky', 'Guna': 'cuk', u'Gujarati': 'gu', u'Hindi': 'hi', 'Zapotec': 'zpg', u'Korean': 'ko', 'Malagasy': 'mg', 'Hungarian': 'hu', 'Igbo': 'ig', u'Lithuanian': 'lt', 
'Greenlandic': 'kl', 'Tzeltal': 'tzh', 'Acholi': 'ach', u'Russian': 'ru', 'Romany': 'rmn', 'Croatian': 'hr', u'Kazakh': 'kk_cyrl', 'Tiv': 'tiv', 'Cebuano': 'ceb', u'Armenian': 'hy_armn', 'Sarnami': 'hns', 'Kikamba': 'kam', 'Toba': 'tob', 'Chol': 'ctu', 'Luvale': 'lue', 'Sepedi': 'nso', 'Mixe': 'mco', u'Greek': 'el', 'Sesotho': 'st', 'Hausa': 'ha', 'Isoko': 'iso', 'Irish': 'ga', 'Seychelles': 'crs', 'German': 'de', 'Runyankore': 'nyn', 'Kwanyama': 'kj', u'Macedonian': 'mk', u'Mongolian': 'mn', 'Aymara': 'ay', u'Mapudungun': 'arn', u'Sinhala': 'si', 'Ndonga': 'ng', u'Vietnamese': 'vi', u'Romanian': 'ro', 'Shona': 'sn', 'Dutch': 'nl', 'Swati': 'ss', 'Somali': 'so', 'Garifuna': 'cab', u'Nepali': 'ne', 'Tokelauan': 'tkl', 'Maya': 'yua', u'Ukrainian': 'uk', 'Welsh': 'cy', u'Mauritian': 'mfe', u'Mayo': 'mfy', 'Kisi': 'kiz', 'Tahitian': 'ty', u'Baoule': 'bci', u'Pilag\xe1': 'plg', 'Rarotongan': 'rar', 'Maltese': 'mt', 'Mam': 'mam', u'Cambodian': 'km', u'Kurdish': 'kmr_cyrl', u'Spanish': 'es', 'Tswana': 'tn', 'Kikaonde': 'kqn', 'Sango': 'sg', 'Oromo': 'om', u'Portuguese': 'pt', u'Huastec': 'hus', u'Myanmar': 'mya', u'Saramaccan': 'srm', 'Sranantongo': 'srn', 'Kiluba': 'lub', u'Japanese': 'ja', 'Kinyarwanda': 'rw', 'Lugbara': 'lgg', 'Ndebele': 'nr', 'Quechua': 'que', 'Kwangali': 'kwn', u'Tajiki': 'tg', u'Ossetian': 'os'}
# loads the html content of an URL, extracts the text and returns it as unicode string
import urllib2
from bs4 import BeautifulSoup
import nltk
# loads url, extracts and returns text as unicode string
def get_text_from_url(url):
    """Download *url* and return the visible text of the page as unicode.

    The request is sent with a desktop Firefox User-Agent header. The raw
    bytes are handed to BeautifulSoup, which detects the character encoding
    of the page and decodes it to unicode.
    """
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:19.0) Gecko/20100101 Firefox/19.0')
    response = opener.open(request)
    try:
        html = response.read()
    finally:
        # always release the connection, even if reading fails
        response.close()
    # pass to BeautifulSoup first because it will determine the character
    # encoding of the website and decode to Unicode
    soup = BeautifulSoup(html)
    # nltk.clean_html() was removed in NLTK 3 (it only raises
    # NotImplementedError and points to BeautifulSoup's get_text()).
    # Script and style elements are dropped first so their source code does
    # not end up in the extracted text.
    for tag in soup(['script', 'style']):
        tag.decompose()
    return soup.get_text()
#! /usr/bin/python
# coding: utf-8 -*-
import sys
import time
import string
import logging
import urllib
import requests
import codecs
from bs4 import BeautifulSoup
# schema of URLs:
# replace this dict and adapt baseurl to receive texts in another language
biblebooks = {"1. Korinther": 46, "Habakuk": 35, "Epheser": 49, "2. Timotheus": 55, "Haggai": 37, "1. Samuel": 9, "Johannes": 43, "Jona": 32, "Daniel": 27, "Zephanja": 36, "1. Petrus": 60, "2. Chronika": 14, "Ruth": 8, "Judas": 65, "1. Mose": 1, "Esther": 17, "Jakobus": 59, "Maleachi": 39, "1. Johannes": 62, "Klagelieder": 25, "2. Mose": 2, "Kolosser": 51, "2. Korinther": 47, "1. Könige": 11, "Prediger": 21, "Micha": 33, "Philipper": 50, "Galater": 48, "Josua": 6, "Markus": 41, "Joel": 29, "Lukas": 42, "Hohes Lied": 22, "Jeremia": 24, "Hosea": 28, "Hiob": 18, "1. Timotheus": 54, "Psalm": 19, "2. Thessalonicher": 53, "Nehemia": 16, "5. Mose": 5, "Amos": 30, "Obadja": 31, "Apostelgeschichte": 44, "1. Chronika": 13, "Richter": 7, "4. Mose": 4, "Nahum": 34, "Matthäus": 40, "Römer": 45, "Sprüche": 20, "3. Johannes": 64, "Jesaja": 23, "Hesekiel": 26, "Hebräer": 58, "Sacharja": 38, "Titus": 56, "Philemon": 57, "Esra": 15, "Offenbarung": 66, "2. Könige": 12, "3. Mose": 3, "2. Johannes": 63, "2. Samuel": 10, "2. Petrus": 61, "1. Thessalonicher": 52}
# Books that consist of a single chapter; they are cited without a chapter
# number (e.g. "Judas 3").
one_chapter_books = ['Obadja', 'Philemon', 'Judas', '2. Johannes', '3. Johannes']
# Make every key of biblebooks a unicode string so that lookups with unicode
# book names (umlauts!) succeed. Iterate over a snapshot of the keys because
# the dict is modified inside the loop.
for k in list(biblebooks.keys()):
    if isinstance(k, bytes):
        biblebooks[k.decode('utf-8')] = biblebooks.pop(k)
# for German bible texts only
# NOTE(review): the base URL literal is empty in this copy of the file.
baseurl = ''
"""Takes as input the text's "address" as string or unicode and returns the text's content as unicode, downloaded from
the German New World Translation published by JW on
Example call: nwt_textfinder.get_bible_text_german(u'Sprüche 18:10')
Returns: u'10\xa0\xa0Der Name Jehovas ist ein starker Turm.+ Der Gerechte l\xe4uft hinein und wird besch\xfctzt.*+\n\n'
Also downloads multiple texts like "1. Mose 1:1, 2" or "1. Mose 1:1-3"
def get_bible_text_german(text):
text = unicode(text.strip())
text = text.replace(u'\xa0', u' ')
while not text[-1].isdigit():
text = text[:-1]
logging.debug('Now starting to look up this text: %s' %text)
if ':' in text:
book_chapter, verse = text.split(':', 1)
book, chapter = book_chapter.rsplit(' ', 1)
except ValueError:
logging.error('No valid bible text - there must be a space between book and chapter: %s' %text)
return ''
book_found = False
for b in one_chapter_books:
if b in text:
logging.warning('This is a text from a book with one chapter: %s' %text)
book, verse = text.rsplit(' ', 1)
except ValueError:
logging.error('No valid bible text - there must be a space between book and verse: %s' %text)
chapter = '1'
book_found = True
if not book_found:
logging.error('No valid text! There must be a colon between the chapter and the verse: %s' %text)
return ''
logging.debug('Book: %s' %book)
logging.debug('Chapter: %s' %chapter)
logging.debug('Verse: %s' %verse)
if '-' in verse:
start_verse, end_verse = verse.split('-')
verselist = range(int(start_verse.strip()), int(end_verse.strip())+1)
verselist = [str(v) for v in verselist]
elif ',' in verse:
a, b = verse.split(',')
verselist = [a.strip(), b.strip()]
verselist = [verse.replace(';','').replace(',','')]
verselist = [v for v in verselist if v]
logging.debug('Will now look up these verse(s): %s' %str(verselist))
book = book.replace(u'\xa0', ' ')
texturl = book.replace('. ', '-') + '/' + chapter + '/'
r = requests.get(baseurl + texturl)
soup = BeautifulSoup(r.content)
bookid = str(biblebooks[book])
except KeyError as err:
logging.error('Book %s was not found! %s' %(book, err))
result = ''
for v in verselist:
textid = 'v' + bookid + (3 - len(chapter)) * '0' + chapter + (3 - len(v)) * '0' + v
logging.debug('ID of the texts element: %s' %textid)
souptext = soup.find(id=textid)
if souptext:
result += souptext.text + '\n'
logging.error('No text found for this verse: %s' %v)
return result
# various Python code snippets
#!/usr/bin/env python
# scrape Wikipedia table and export as .csv file
import urllib, urllib2
import unicodecsv as csv
import json
import codecs
from BeautifulSoup import BeautifulSoup
def main(url, table_names):
    """Scrape the 'wikitable' tables of a Wikipedia page into all_data.csv.

    url         -- address of the page to download
    table_names -- one name per scraped table; it is appended to every row
                   of that table as the 'Continent' column

    The header cells of the first table, plus 'Continent', form the header
    row. Every row is printed and written to 'all_data.csv' in the current
    directory.
    """
    fh = urllib.urlopen(url)
    try:
        content = fh.read()
    finally:
        fh.close()
    soup = BeautifulSoup(content)
    tables = soup.findAll('table', 'wikitable') #excludes vertical navigation box with article summary

    all_data = []
    # the last wikitable on the page is skipped
    for table_no, table in enumerate(tables[:-1]):
        if table_no == 0:
            heads = [th.text for th in table.findAll('th')]
            all_data.append(heads + [u'Continent'])
        for row in table.findAll('tr'):
            # NOTE(review): the body of this `if` was lost from the source;
            # skipping header rows is the presumed intent -- confirm.
            if row.findAll('th'):
                continue
            # prefer the link text of a cell when the cell contains a link
            row_data = [td.a.text if td.a else td.text for td in row.findAll('td')]
            # NOTE(review): reconstructed -- `table_names` was otherwise
            # unused although the header gets a 'Continent' column.
            row_data.append(table_names[table_no])
            all_data.append(row_data)

    # encode every cell to UTF-8 byte strings before printing/writing
    all_data_utf8 = [[unicode(s).encode('utf-8') for s in line] for line in all_data]

    # binary mode, as the csv writers expect on Python 2
    with open('all_data.csv', 'wb') as out:
        writer = csv.writer(out, dialect='excel', encoding='utf-8')
        for row in all_data_utf8:
            print(row)
            # the source created the writer but never wrote a row
            writer.writerow(row)
if __name__ == '__main__':
    # NOTE(review): the URL literal looks truncated (scheme, host and the
    # start of the article title are missing) -- restore it before running.
    url = r"'s_Witnesses_by_country"
    # one name per scraped table, in page order
    table_names = [u'Africa', u'North America', u'Caribbean', u'South America', u'Asia', u'Europe', u'Oceania', u'Other']
    main(url, table_names)
'''Send mobi files to kindle email address with python's email package'''
def send2kindle(f):
    """Mail the mobi file at path *f* as an attachment to the Kindle address.

    Relies on the module-level settings smtpserver, port, sender_email,
    sender_password and kindles_email.
    """
    # function-scope import: needed to attach the plain-text body
    from email.mime.text import MIMEText

    msg = MIMEMultipart()
    msg['Subject'] = "Neue Zeitschriftenlieferung!"
    # NOTE(review): the value of the From header was lost from the source;
    # sender_email is assumed -- confirm.
    msg['From'] = sender_email
    # NOTE(review): the To header names sender_email while the envelope
    # recipient below is kindles_email -- confirm this is intended.
    msg['To'] = sender_email
    msg['Date'] = formatdate(localtime=True)
    message = "Neue Zeitschriften im Anhang"
    # the source built the body text and the attachment but attached neither
    msg.attach(MIMEText(message))
    with open(f, 'rb') as mobi:
        # MIMEApplication already supplies the 'application/' main type, so
        # _subtype must be the bare subtype.
        part = MIMEApplication(mobi.read(), _subtype='x-mobipocket-ebook')
    part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(f))
    msg.attach(part)
    # connect to Google's SMTP server
    mailserver = smtplib.SMTP(smtpserver, port)
    try:
        mailserver.ehlo()
        # upgrade to TLS before sending credentials when the server offers it
        if mailserver.has_extn('starttls'):
            mailserver.starttls()
            mailserver.ehlo()
        mailserver.login(sender_email, sender_password)
        mailserver.sendmail(sender_email, kindles_email, msg.as_string())
    finally:
        mailserver.quit()
# choose a key from a dictionary randomly - the probability of a key getting
# chosen is proportional to its value
# d is a dictionary mapping keys to integer counts
def simpleProbDist(d):
    """Return a random key of *d*, weighted by its integer count.

    A key with count c is chosen with probability c / sum(d.values()); keys
    with count 0 are never chosen.

    Raises ValueError when *d* is empty or all counts are zero.
    """
    total = sum(d.values())
    if total < 1:
        raise ValueError('simpleProbDist needs at least one positive count')
    # draw a position in 1..total and return the key whose cumulative count
    # range contains it
    x = random.randint(1, total)
    upper = 0
    for k, count in d.items():
        upper += count
        if x <= upper:
            return k
#remove punctuation:
import string
# Identity translation table: the same 256-character string that
# string.maketrans("", "") returns on Python 2. Kept only for code that still
# refers to `table`; rem_punct itself no longer needs it.
table = "".join(chr(i) for i in range(256))
_PUNCTUATION = frozenset(string.punctuation)
def rem_punct(s):
    """Return *s* with all ASCII punctuation characters removed.

    Works for byte strings and unicode strings alike; the two-argument
    ``s.translate(table, deletechars)`` form used before is only available
    on Python 2 byte strings and raises TypeError for unicode input.
    """
    return "".join(ch for ch in s if ch not in _PUNCTUATION)
# lemmatizing of nouns and verbs with the NodeBox Linguistics library and WordNet
# perform POS tagging beforehand to improve precision
def lemmatize(words):
    """Return the list of lemmas for the given list of words.

    Words recognised as verbs are reduced to their infinitive, words
    recognised as nouns to their singular; everything else is kept as is.
    """
    lemmas = []
    for w in words:
        # NOTE(review): the bodies of these branches were lost from the
        # source; the calls below are the presumed originals -- confirm.
        if en.is_verb(w):
            # fall back to the word itself if no infinitive is known
            lemmas.append(en.verb.infinitive(w) or w)
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w))
        else:
            lemmas.append(w)
    return lemmas
# hopefully I won't forget these any more now that I have written them down here :-)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Cheat sheet of unrelated one-liners; the fragments are not meant to run
# together as one script.

# script entry-point guard
# NOTE(review): the guarded body is missing from this snippet.
if __name__ == '__main__':
# only Python 2, not 3
# print an I/O error message to stderr (`print >>` is Python 2 syntax)
# NOTE(review): the matching `try:` block is missing; `IOerror` is not a
# builtin name (the exception class is spelled `IOError`), and `stderr`
# presumably refers to sys.stderr -- confirm.
except IOerror as e:
print >> stderr, "IOError: %s" %e.strerror
# configure the root logger: timestamp, level and message, INFO and above
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# silence DeprecationWarning messages
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
# execute bash command
# NOTE(review): `cmd` and the `subprocess` import are not defined in this
# snippet.
subprocess.Popen(['/bin/bash', '-c', cmd])
# get output of bash command in python
output = subprocess.check_output(['/bin/bash', '-c', cmd])
# get stderr
# stderr=subprocess.STDOUT redirects stderr into the captured output
output = subprocess.check_output('ls', stderr=subprocess.STDOUT)
# Pylab / NumPy etc.
# load a semicolon-separated CSV file into an array
data = np.genfromtxt('all_absolute.csv', delimiter=';')
# build a 2x3 array from nested lists (the outer brackets were lost in the
# source, which left the literal unbalanced)
data = np.array([[1, 2, 3], [4, 5, 6]])
# show an array as an image without smoothing between cells
# NOTE(review): the call was cut off after `interpolation='nearest',`; any
# further keyword arguments are lost, and `data4` is not defined in this
# snippet.
plt.imshow(data4, interpolation='nearest')
import wikipedia #
import urllib
import re
# Load a Wikipedia page by title; if the title is ambiguous, fall back to
# looking the page up by its numeric page id.
# NOTE(review): several calls were lost from the source; the wikipedia.page(),
# fh.read() and re.search() calls below are reconstructions, and page_title,
# page_link and baseurl must be defined by the surrounding code -- confirm.
msg = 'Error!'
try:
    p = wikipedia.page(page_title)
except wikipedia.DisambiguationError as err:
    print(err)
    # if you know only the page_title and url but not the pageid you have to
    # find it in the html code of the wikipedia page before you can use
    # the API functions
    fh = urllib.urlopen(baseurl + page_link)
    if fh:
        html = fh.read()
        r = re.search(r'"wgArticleId":(\d+),', html)
        # re.search returns None when the id is not present in the page
        if r:
            pageid = r.group(1)
            p = wikipedia.page(pageid=pageid)
        else:
            print(msg)
    else:
        print(msg)
import codecs
import string
import re
import logging
import nltk
import gensim
import en
# before starting the script install Cython with `pip install cython` to use optimized word2vec training (70x speedup).
# Creates and saves a word2vec model from a big (some MB) file of raw text.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Identity translation table: the same 256-character string that
# string.maketrans("", "") returns on Python 2. Kept only for code that still
# refers to `table`; remove_punct itself no longer needs it.
table = "".join(chr(i) for i in range(256))
_PUNCTUATION = frozenset(string.punctuation)
def remove_punct(s):
    """Return *s* with all ASCII punctuation characters removed.

    Works for byte strings and unicode strings alike; the two-argument
    ``s.translate(table, deletechars)`` form used before is only available
    on Python 2 byte strings and raises TypeError for unicode input.
    """
    return "".join(ch for ch in s if ch not in _PUNCTUATION)
def lemmatize(words):
    """Return the list of lemmas for the given list of words.

    Words recognised as verbs are reduced to their infinitive, words
    recognised as nouns to their singular; everything else is kept as is.
    """
    lemmas = []
    for w in words:
        # NOTE(review): the bodies of these branches were lost from the
        # source; the calls below are the presumed originals -- confirm.
        if en.is_verb(w):
            # fall back to the word itself if no infinitive is known
            lemmas.append(en.verb.infinitive(w) or w)
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w))
        else:
            lemmas.append(w)
    return lemmas
class WordPreparer(object):
    """Iterable of lemmatized sentences read from a UTF-8 text file.

    The file is read and split into sentences once, in the constructor.
    Each iteration yields one sentence as a list of lemmas. Because the
    sentences are kept in a list, the object can be iterated repeatedly.
    """

    def __init__(self, fname):
        """Read *fname* and store its punctuation-free sentences."""
        self.fname = fname
        with codecs.open(fname, mode='r', encoding='utf-8') as fh:
            text = fh.read()
        # Sentences are encoded to UTF-8 byte strings before punctuation is
        # removed, as in the original code.
        self.sents = [
            remove_punct(s.encode('utf-8')) for s in nltk.sent_tokenize(text)
        ]

    def __iter__(self):
        """Yield each stored sentence as a list of lemmas."""
        for s in self.sents:
            # punctuation was already stripped in __init__
            yield lemmatize(s.split())
if __name__ == '__main__':
    # train a word2vec model on the prepared sentences and store it on disk
    words = WordPreparer('raw_text.txt')
    model = gensim.models.Word2Vec(words, min_count=3, workers=2)
    # the save call had been fused into the previous line in the source
    model.save('word2vec_model')
