Created
June 17, 2013 04:24
-
-
Save danilobellini/5794639 to your computer and use it in GitHub Desktop.
Get some statistics from the Bible
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Created on Sun Jun 16 05:19:49 2013 | |
# Danilo de Jesus da Silva Bellini | |
""" Get some statistics from bible """ | |
from __future__ import print_function, unicode_literals | |
import os, zipfile, random | |
try: # Python 2 | |
from urllib2 import urlopen | |
from string import letters as ascii_letters | |
except ImportError: # Python 3 | |
from urllib.request import urlopen | |
from string import ascii_letters | |
bible_link = "http://printkjv.ifbweb.com/" | |
zip_name = "AV_txt.zip" | |
class DownloadError(Exception):
    """Raised when a download answers with an HTTP status other than 200."""
def download_file(url, output_filename):
    """Download ``url`` and store the response body in ``output_filename``.

    The output file is opened in binary mode and overwritten if it exists.

    Raises:
        DownloadError: if the response status code is not 200.
    """
    # Local import keeps this block self-contained; ``closing`` is used
    # because the Python 2 ``urlopen`` result is not a context manager.
    from contextlib import closing

    # Always release the connection, even when the status check fails.
    with closing(urlopen(url)) as response:
        code = response.getcode()
        if code != 200:
            raise DownloadError("Error {}".format(code))
        payload = response.read()

    # Only touch the output file once the whole body has been received.
    with open(output_filename, "wb") as f:
        f.write(payload)
def get_bible_data():
    """Return the text of the first member of the archive as a string.

    The archive ``zip_name`` is fetched from ``bible_link`` first when it is
    not present in the current directory or is not a valid zip file. The
    member's bytes are decoded as UTF-8.
    """
    have_archive = os.path.exists(zip_name) and zipfile.is_zipfile(zip_name)
    if not have_archive:
        download_file(bible_link + zip_name, zip_name)

    with zipfile.ZipFile(zip_name) as archive:
        first_member = archive.namelist()[0]
        raw_text = archive.read(first_member)
        return raw_text.decode("utf-8")
def new_random_word():
    """Return a random "word" built from distinct ASCII letters.

    The length is drawn first (at least 3, strictly less than the number of
    available letters), then the letters are shuffled and the leading
    ``length`` of them are joined together.
    """
    pool = [letter for letter in ascii_letters]
    length = random.randrange(3, len(pool))
    random.shuffle(pool)
    del pool[length:]  # keep only the leading ``length`` shuffled letters
    return "".join(pool)
if __name__ == "__main__":
    # Split the whole text on whitespace and build the unique-item set.
    words = get_bible_data().split()
    set_words = set(words)
    print("Total whitespace-separated items:", len(words))
    print("Unique (case sensitive):", len(set_words))

    # Comparison for Junior Polegato
    print()
    # Three items taken from the text itself, plus 17 random words.
    nw = [words[position] for position in (5, 37, 458)]
    nw += [new_random_word() for _ in range(17)]
    print("New words:", nw)
    print()

    # Membership is checked against the list and against the set on purpose,
    # so that both counts are reported side by side.
    repeated = [x for x in nw if x in words]
    repeated_in_set = [x for x in nw if x in set_words]
    print("Repeated:", len(repeated))
    print("Repeated in set:", len(repeated_in_set))

    # Appending the repeated items grows the list; the set size is reported
    # from the same combined list for comparison.
    final = words + repeated
    print("Final size, appending the new words:", len(final))
    print("Final size, adding to set:", len(set(final)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment