Python Snippets
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# converts csv to libsvm format
# usage: python convert_to_libsvm.py (after modifying the input and output filenames below)
import codecs
# change input filename here
filename = "features.csv"
fh = codecs.open(filename, "r", "utf-8")
lines = fh.readlines()
fh.close()
# change output filename here
out = "features.libsvm"
fh = codecs.open(out, "w", "utf-8")
for l in lines:
    l = l.split(";")
    lineout = l[0] + " "
    values = l[1:-2]
    for i in range(len(values)):
        # libsvm feature indices start at 1, not at 0
        lineout += str(i + 1) + ":" + values[i] + " "
    lineout += "\n"
    fh.write(lineout)
fh.close()
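# A worked example of the conversion (hedged guess at the intended CSV layout: the class
# label sits in the first column, and the l[1:-2] slice drops the last two split fields):
#   input line:  "1;0.5;0.3;unused;\n"  -> fields ['1', '0.5', '0.3', 'unused', '\n']
#   output line: "1 1:0.5 2:0.3 \n"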
#download the free audio trainer German / French from Deutsche Welle
import urllib
for i in range(1, 10):
    urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion00' + str(i) + '_dwdownload.mp3', '/home/me/filename_' + str(i) + '.mp3')
for i in range(10, 100):
    urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion0' + str(i) + '_dwdownload.mp3', '/home/me/filename_' + str(i) + '.mp3')
urllib.urlretrieve('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/Audiotrainer_Franzoesisch_Lektion100_dwdownload.mp3', '/home/me/filename_100.mp3')
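# The three calls above only differ in how the lesson number is zero-padded; a single
# loop with str.zfill produces the same URLs (sketch, Python 2 like the rest of the gist):
for i in range(1, 101):
    lesson = str(i).zfill(3)  # 001 ... 100
    url = ('http://radio-download.dw.de/Events/dwelle/deutschkurse/audiotrainer/fra/'
           'Audiotrainer_Franzoesisch_Lektion' + lesson + '_dwdownload.mp3')
    urllib.urlretrieve(url, '/home/me/filename_' + str(i) + '.mp3')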
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import urllib
# performs a Google search for the title and returns the first URLs of the query result
def search_web(title):
    words = " ".join(title.split())  # collapse whitespace; urlencode escapes it for the URL
    query = urllib.urlencode({'q': words})
    # the Google AJAX API only returns the first few results, but that is enough for our purpose
    response = urllib.urlopen('http://ajax.googleapis.com/ajax/services/search/web?v=1.0&' + query).read()
    json_response = json.loads(response)
    if json_response and json_response['responseData']:
        results = json_response['responseData']['results']
    else:
        return []
    urls = [result['url'] for result in results]
    print('Found ' + str(len(urls)) + ' URLs.')
    return urls
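# Minimal usage sketch (the query string is just a placeholder; note that this old AJAX
# endpoint may no longer answer):
if __name__ == '__main__':
    for url in search_web('some article title'):
        print url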
{'Estonian': 'et', 'Bicol': 'bcl', 'Zande': 'zne', 'Uruund': 'rnd', 'Khoekhoegowab': 'naq', u'Newari': 'new', 'Krio': 'kri', 'Nuer': 'nus', 'Cinyanja': 'nya', u'Bulgarian': 'bg', 'Norwegian': 'no', u'Yoruba': 'yo', u'French': 'fr', u'Otomi': 'oto', 'Ateso': 'teo', 'Iloko': 'ilo', 'Wolaita': 'wal', 'Tsonga': 'ts', 'Tzotzil': 'tzo', u'Tamil': 'ta', u'Haitian': 'ht', 'Samoan': 'sm', 'Aukan': 'djk', 'Finnish': 'fi', 'Rutoro': 'ttj', 'Dangme': 'ada', 'Albanian': 'sq', 'Mbunda': 'mck', 'Solomon': 'pis', 'Hiligaynon': 'hil', 'Tagalog': 'tl', u'Serbian': 'sr_latn', u'Efik': 'efi', 'Pangasinan': 'pag', 'Italian': 'it', 'Miskito': 'miq', 'Lhukonzo': 'koo', 'Lamba': 'lam', u'Kongo': 'kg', 'Mazatec': 'mau', u'Tarascan': 'tsz', u'Amharic': 'am', u'Czech': 'cs', u'Papiamento': 'pap', u'Nahuatl': 'ncj', 'Ga': 'gaa', 'Polish': 'pl', 'Tongan': 'to', 'Xhosa': 'xh', 'Swedish': 'sv', u'Marathi': 'mr', 'Luganda': 'lg', u'Slovenian': 'sl', 'Ewe': 'ee', u'Azerbaijani': 'az_cyrl', u'Kikuyu': 'ki', 'Luo': 'luo', 'Tankarana': 'xmv', 'Danish': 'da', 'Indonesian': 'id', 'Frafra': 'gur', 'Zulu': 'zu', 'Lenje': 'leh', 'Cakchiquel': 'cak', u'Georgian': 'ka', 'Mayangna': 'yan', 'Tetum': 'tdt', u'Tigrinya': 'ti', 'Nzema': 'nzi', 'Niuean': 'niu', u'Slovak': 'sk', u'Thai': 'th', 'Afrikaans': 'af', u'Lahu': 'lhu', u'Guarani': 'gug', 'Sidama': 'sid', u'Punjabi': 'pa', 'Kalenjin': 'kln', 'Herero': 'hz', u'Kekchi': 'kek', 'Kisonge': 'sop', u'Latvian': 'lv', 'English': 'en', 'Mambwe-Lungu': 'mgr', 'Lingala': 'ln', u'Faeroese': 'fo', u'Chinese': 'zh_hant', 'Wayuunaiki': 'guc', 'Quichua': 'qus', 'Huave': 'huv', u'Tatar': 'tt', 'Kabyle': 'kab', 'Chin': 'cnh', u'Quiche': 'quc', 'Rapa': 'rap', 'Venda': 've', 'Tojolabal': 'toj', 'Swahili': 'sw', u'Icelandic': 'is', u'Turkish': 'tr', 'Kalanga': 'kck', 'Twi': 'tw', 'Waray-Waray': 'war', u'Kirghiz': 'ky', 'Guna': 'cuk', u'Gujarati': 'gu', u'Hindi': 'hi', 'Zapotec': 'zpg', u'Korean': 'ko', 'Malagasy': 'mg', 'Hungarian': 'hu', 'Igbo': 'ig', u'Lithuanian': 'lt', 'Greenlandic': 'kl', 'Tzeltal': 'tzh', 'Acholi': 'ach', u'Russian': 'ru', 'Romany': 'rmn', 'Croatian': 'hr', u'Kazakh': 'kk_cyrl', 'Tiv': 'tiv', 'Cebuano': 'ceb', u'Armenian': 'hy_armn', 'Sarnami': 'hns', 'Kikamba': 'kam', 'Toba': 'tob', 'Chol': 'ctu', 'Luvale': 'lue', 'Sepedi': 'nso', 'Mixe': 'mco', u'Greek': 'el', 'Sesotho': 'st', 'Hausa': 'ha', 'Isoko': 'iso', 'Irish': 'ga', 'Seychelles': 'crs', 'German': 'de', 'Runyankore': 'nyn', 'Kwanyama': 'kj', u'Macedonian': 'mk', u'Mongolian': 'mn', 'Aymara': 'ay', u'Mapudungun': 'arn', u'Sinhala': 'si', 'Ndonga': 'ng', u'Vietnamese': 'vi', u'Romanian': 'ro', 'Shona': 'sn', 'Dutch': 'nl', 'Swati': 'ss', 'Somali': 'so', 'Garifuna': 'cab', u'Nepali': 'ne', 'Tokelauan': 'tkl', 'Maya': 'yua', u'Ukrainian': 'uk', 'Welsh': 'cy', u'Mauritian': 'mfe', u'Mayo': 'mfy', 'Kisi': 'kiz', 'Tahitian': 'ty', u'Baoule': 'bci', u'Pilag\xe1': 'plg', 'Rarotongan': 'rar', 'Maltese': 'mt', 'Mam': 'mam', u'Cambodian': 'km', u'Kurdish': 'kmr_cyrl', u'Spanish': 'es', 'Tswana': 'tn', 'Kikaonde': 'kqn', 'Sango': 'sg', 'Oromo': 'om', u'Portuguese': 'pt', u'Huastec': 'hus', u'Myanmar': 'mya', u'Saramaccan': 'srm', 'Sranantongo': 'srn', 'Kiluba': 'lub', u'Japanese': 'ja', 'Kinyarwanda': 'rw', 'Lugbara': 'lgg', 'Ndebele': 'nr', 'Quechua': 'que', 'Kwangali': 'kwn', u'Tajiki': 'tg', u'Ossetian': 'os'}
# loads the HTML content of a URL, extracts the text and returns it as a unicode string
import urllib2
from bs4 import BeautifulSoup
import nltk

def get_text_from_url(url):
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:19.0) Gecko/20100101 Firefox/19.0')
    html = opener.open(request).read()
    # pass the raw HTML to BeautifulSoup first: it detects the character encoding of the website and decodes to Unicode
    soup = BeautifulSoup(html)
    html_unicode = unicode(soup)
    text_unicode = nltk.clean_html(html_unicode)
    return text_unicode
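# nltk.clean_html() was removed in NLTK 3; a rough equivalent that lets BeautifulSoup do
# the tag stripping itself (the function name is illustrative, request setup copied from above):
def get_text_from_url_bs4(url):
    opener = urllib2.build_opener()
    request = urllib2.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:19.0) Gecko/20100101 Firefox/19.0')
    html = opener.open(request).read()
    soup = BeautifulSoup(html)
    return soup.get_text()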
#! /usr/bin/python
# -*- coding: utf-8 -*-
import sys
import time
import string
import logging
import urllib
import requests
import codecs
from bs4 import BeautifulSoup
# schema of URLs:
# http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/prediger/4/#v21004007
# replace this dict and adapt baseurl to receive texts in another language
biblebooks = {"1. Korinther": 46, "Habakuk": 35, "Epheser": 49, "2. Timotheus": 55, "Haggai": 37, "1. Samuel": 9, "Johannes": 43, "Jona": 32, "Daniel": 27, "Zephanja": 36, "1. Petrus": 60, "2. Chronika": 14, "Ruth": 8, "Judas": 65, "1. Mose": 1, "Esther": 17, "Jakobus": 59, "Maleachi": 39, "1. Johannes": 62, "Klagelieder": 25, "2. Mose": 2, "Kolosser": 51, "2. Korinther": 47, "1. Könige": 11, "Prediger": 21, "Micha": 33, "Philipper": 50, "Galater": 48, "Josua": 6, "Markus": 41, "Joel": 29, "Lukas": 42, "Hohes Lied": 22, "Jeremia": 24, "Hosea": 28, "Hiob": 18, "1. Timotheus": 54, "Psalm": 19, "2. Thessalonicher": 53, "Nehemia": 16, "5. Mose": 5, "Amos": 30, "Obadja": 31, "Apostelgeschichte": 44, "1. Chronika": 13, "Richter": 7, "4. Mose": 4, "Nahum": 34, "Matthäus": 40, "Römer": 45, "Sprüche": 20, "3. Johannes": 64, "Jesaja": 23, "Hesekiel": 26, "Hebräer": 58, "Sacharja": 38, "Titus": 56, "Philemon": 57, "Esra": 15, "Offenbarung": 66, "2. Könige": 12, "3. Mose": 3, "2. Johannes": 63, "2. Samuel": 10, "2. Petrus": 61, "1. Thessalonicher": 52}
one_chapter_books = ['Obadja', 'Philemon', 'Judas', '2. Johannes', '3. Johannes']
# convert the dict's byte-string keys to unicode
for k in biblebooks.keys():
    biblebooks[unicode(k.decode('utf-8'))] = biblebooks.pop(k)
# for German bible texts only
baseurl = 'http://www.jw.org/de/publikationen/bibel/nwt/bibelbuecher/'
"""Takes as input the text's "address" as string or unicode and returns the text's content as unicode, downloaded from
the German New World Translation published by JW on jw.org.
Example call: nwt_textfinder.get_bible_text_german(u'Sprüche 18:10')
Returns: u'10\xa0\xa0Der Name Jehovas ist ein starker Turm.+ Der Gerechte l\xe4uft hinein und wird besch\xfctzt.*+\n\n'
Also downloads multiple texts like "1. Mose 1:1, 2" or "1. Mose 1:1-3"
"""
def get_bible_text_german(text):
text = unicode(text.strip())
text = text.replace(u'\xa0', u' ')
while not text[-1].isdigit():
text = text[:-1]
logging.debug('Now starting to look up this text: %s' %text)
if ':' in text:
book_chapter, verse = text.split(':', 1)
try:
book, chapter = book_chapter.rsplit(' ', 1)
except ValueError:
logging.error('No valid bible text - there must be a space between book and chapter: %s' %text)
return ''
else:
book_found = False
for b in one_chapter_books:
if b in text:
logging.warning('This is a text from a book with one chapter: %s' %text)
try:
book, verse = text.rsplit(' ', 1)
except ValueError:
logging.error('No valid bible text - there must be a space between book and verse: %s' %text)
chapter = '1'
book_found = True
if not book_found:
logging.error('No valid text! There must be a colon between the chapter and the verse: %s' %text)
return ''
logging.debug('Book: %s' %book)
logging.debug('Chapter: %s' %chapter)
logging.debug('Verse: %s' %verse)
if '-' in verse:
start_verse, end_verse = verse.split('-')
verselist = range(int(start_verse.strip()), int(end_verse.strip())+1)
verselist = [str(v) for v in verselist]
elif ',' in verse:
a, b = verse.split(',')
verselist = [a.strip(), b.strip()]
else:
verselist = [verse.replace(';','').replace(',','')]
verselist = [v for v in verselist if v]
logging.debug('Will now look up these verse(s): %s' %str(verselist))
book = book.replace(u'\xa0', ' ')
texturl = book.replace('. ', '-') + '/' + chapter + '/'
r = requests.get(baseurl + texturl)
soup = BeautifulSoup(r.content)
try:
bookid = str(biblebooks[book])
except KeyError as err:
logging.error('Book %s was not found! %s' %(book, err))
result = ''
for v in verselist:
textid = 'v' + bookid + (3 - len(chapter)) * '0' + chapter + (3 - len(v)) * '0' + v
logging.debug('ID of the texts element: %s' %textid)
souptext = soup.find(id=textid)
if souptext:
result += souptext.text + '\n'
else:
logging.error('No text found for this verse: %s' %v)
return result
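# Hedged usage sketch, matching the example call in the docstring (the returned text of
# course depends on what jw.org serves at request time):
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    print get_bible_text_german(u'Sprüche 18:10').encode('utf-8')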
# various Python code snippets
#!/usr/bin/env python
# scrape Wikipedia table and export as .csv file
import urllib, urllib2
import unicodecsv as csv
import json
import codecs
from BeautifulSoup import BeautifulSoup
def main(url, table_names):
    all_data = list()
    fh = urllib.urlopen(url)
    content = fh.read()
    soup = BeautifulSoup(content)
    tables = soup.findAll('table', 'wikitable')  # excludes vertical navigation box with article summary
    for table_no, table in enumerate(tables[:-1]):
        if table_no == 0:
            heads = [th.text for th in table.findAll('th')]
            all_data.append(heads + [u'Continent'])
        for row in table.findAll('tr'):
            row_data = []
            if row.findAll('th'):
                continue
            for table_data in row.findAll('td'):
                if table_data.a:
                    row_data.append(table_data.a.text)
                else:
                    row_data.append(table_data.text)
            row_data.append(table_names[table_no])
            all_data.append(row_data)
    # csv module cannot handle unicode (shame on you)
    all_data_utf8 = []
    for line in all_data:
        all_data_utf8.append([unicode(s).encode('utf-8') for s in line])
    out = open('all_data.csv', 'w')
    writer = csv.writer(out, dialect='excel', encoding='utf-8')
    for row in all_data_utf8:
        print row
        writer.writerow(row)
    out.close()

if __name__ == '__main__':
    url = r"https://en.wikipedia.org/wiki/Jehovah's_Witnesses_by_country"
    # TODO
    table_names = [u'Africa', u'North America', u'Caribbean', u'South America', u'Asia', u'Europe', u'Oceania', u'Other']
    main(url, table_names)
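# Hedged note: this snippet imports the old BeautifulSoup 3 package; with bs4 (used in the
# other snippets above) the equivalent would be
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(content, 'html.parser')
# and findAll() is spelled find_all(), although the camel-case alias still works.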
'''Send .mobi files to the Kindle email address with Python's email and smtplib packages.'''
import os
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication
from email.utils import formatdate

# smtpserver, port, sender_email, sender_password and kindles_email are expected to be
# defined at module level (placeholder values follow below)
def send2kindle(f):
    msg = MIMEMultipart()
    msg['Subject'] = "Neue Zeitschriftenlieferung!"  # "New magazine delivery!"
    msg['From'] = sender_email  # assumption: the sender address was left blank in the original
    msg['To'] = sender_email
    msg['Date'] = formatdate(localtime=True)
    message = "Neue Zeitschriften im Anhang"  # "New magazines attached"
    msg.attach(MIMEText(message))
    part = MIMEApplication(open(f, 'rb').read(), _subtype='x-mobipocket-ebook')
    part.add_header('Content-Disposition', 'attachment; filename="%s"' % os.path.basename(f))
    msg.attach(part)
    # connect to Google's SMTP server
    mailserver = smtplib.SMTP(smtpserver, port)
    mailserver.ehlo()
    mailserver.starttls()
    mailserver.ehlo()
    mailserver.login(sender_email, sender_password)
    mailserver.sendmail(sender_email, kindles_email, msg.as_string())
    mailserver.close()
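# Hedged usage sketch: the module-level settings the function relies on are not part of
# the gist, so these are placeholder values (Gmail shown only as an example host):
smtpserver = 'smtp.gmail.com'
port = 587
sender_email = 'me@example.com'
sender_password = 'my-app-password'
kindles_email = 'me_123@kindle.com'
send2kindle('/path/to/magazine.mobi')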
# choose a key from a dictionary at random - the probability of a key being chosen is proportional to its value
# d is a dictionary mapping keys to counts
import random

def simpleProbDist(d):
    i = 1
    d2 = dict()
    for k in d.keys():
        r = (i, i + d[k])
        i += d[k]
        d2[k] = r
    x = random.randint(1, i - 1)
    for k in d2.keys():
        if x in range(*d2[k]):
            chosen = k
            break
    return chosen
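# Quick sanity check (illustration only): with counts {'a': 1, 'b': 3}, 'b' should be
# chosen roughly three times out of four.
counts = {'a': 1, 'b': 3}
samples = [simpleProbDist(counts) for _ in range(1000)]
print samples.count('b') / float(len(samples))  # ~0.75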
# remove punctuation (Python 2: string.maketrans plus the deletechars argument of str.translate)
import string
table = string.maketrans("", "")

def rem_punct(s):
    return s.translate(table, string.punctuation)
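# Sanity check plus a hedged note: the maketrans/translate combination above is Python 2
# only; in Python 3 the rough equivalent is str.maketrans('', '', string.punctuation).
print rem_punct("Hello, world!")  # -> "Hello world"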
# lemmatize nouns and verbs with the NodeBox Linguistics library and WordNet
# perform POS tagging beforehand to improve precision
import en  # NodeBox Linguistics English module

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas
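# Illustrative call (hedged: needs the NodeBox 'en' module on the path, and the exact
# output depends on its word lists):
print lemmatize(['cats', 'running', 'quickly'])  # something like ['cat', 'run', 'quickly']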
# hopefully I won't forget them anymore now that I have written them down here :-)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# current timestamp as a string
import datetime
str(datetime.datetime.now())
if __name__ == '__main__':
# print to stderr (Python 2 syntax only, not Python 3)
import sys
try:
    pass  # whatever I/O goes here
except IOError as e:
    print >> sys.stderr, "IOError: %s" % e.strerror
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
# execute a bash command
import subprocess
subprocess.Popen(['/bin/bash', '-c', cmd])
# get the output of a bash command in Python
output = subprocess.check_output(['/bin/bash', '-c', cmd])
# also capture stderr
output = subprocess.check_output('ls', stderr=subprocess.STDOUT)
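# Concrete example of the pattern above (hedged: 'ls -l /tmp' is just a stand-in command):
cmd = 'ls -l /tmp'
print subprocess.check_output(['/bin/bash', '-c', cmd])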
# Pylab / Numpy etc.
import numpy as np
import matplotlib.pyplot as plt
data = np.genfromtxt('all_absolute.csv', delimiter=';')
data = np.array([[1, 2, 3], [4, 5, 6]])
plt.imshow(data, interpolation='nearest', cmap=plt.cm.bone)
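# Hedged reminder: in a plain (non-interactive) script the figure only appears after
plt.show()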
import wikipedia  # https://github.com/goldsmith/Wikipedia
import urllib
import re

# Illustrative wrapper (the function name and the default baseurl are assumptions):
# resolves a Wikipedia page by title and, on a disambiguation error, falls back to the
# pageid scraped from the page's HTML.
def get_wikipedia_page(page_title, page_link, baseurl='https://en.wikipedia.org/wiki/'):
    msg = 'Error!'
    try:
        p = wikipedia.page(page_title)
    # if you only know the page title and URL but not the pageid, you have to
    # find it in the HTML code of the Wikipedia page before you can use
    # the API functions
    except wikipedia.DisambiguationError as err:
        print err
        fh = urllib.urlopen(baseurl + page_link)
        if fh:
            html = fh.read()
            r = re.search(r'"wgArticleId":(\d+),', html)
            groups = r.groups()
            if groups:
                pageid = groups[0]
                p = wikipedia.page(pageid=pageid)
            else:
                print msg
                return
        else:
            print msg
            return
    return p
#!/usr/bin/python
import codecs
import string
import re
import logging
import nltk
import gensim
import en

# Before starting the script, install Cython with `pip install cython` to use the optimized word2vec training (70x speedup).
# Creates and saves a word2vec model from a big (several MB) file of raw text.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

table = string.maketrans("", "")

def remove_punct(s):
    return s.translate(table, string.punctuation)

def lemmatize(words):
    lemmas = []
    for w in words:
        if en.is_verb(w):
            lemmas.append(en.verb.infinitive(w.lower()))
        elif en.is_noun(w):
            lemmas.append(en.noun.singular(w.lower()))
        else:
            lemmas.append(w.lower())
    return lemmas

class WordPreparer(object):
    """Iterates over the sentences of a raw text file, punctuation-stripped and lemmatized."""
    def __init__(self, fname):
        self.fname = fname
        fh = codecs.open(fname, mode='r', encoding='utf-8')
        text = fh.read()
        self.sents = nltk.sent_tokenize(text)
        self.sents = [remove_punct(s.encode('utf-8')) for s in self.sents]

    def __iter__(self):
        for s in self.sents:
            yield lemmatize(remove_punct(s).split())

if __name__ == '__main__':
    words = WordPreparer('raw_text.txt')
    model = gensim.models.Word2Vec(words, min_count=3, workers=2)
    model.save('word2vec_model')
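    # Hedged follow-up: reload the saved model and query it (whether a given word is in
    # the vocabulary depends entirely on raw_text.txt, so 'house' is only an illustration):
    model = gensim.models.Word2Vec.load('word2vec_model')
    print model.most_similar('house', topn=5)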