Created
January 23, 2015 12:03
-
-
Save skurmedel/1bf7e224dbb7c466ada7 to your computer and use it in GitHub Desktop.
Quotes timecube.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.request import * | |
from html.parser import HTMLParser | |
import sys | |
import os | |
import random | |
class _TimecubeParser(HTMLParser): | |
def __init__(self): | |
super().__init__(self) | |
self.in_span = False | |
self.pieces = [] | |
def handle_starttag(self, tag, attrs): | |
if tag.lower() == "span": | |
self.in_span = True | |
def handle_endtag(self, tag): | |
if tag.lower() == "span": | |
self.in_span = False | |
def handle_data(self, data): | |
if not self.in_span: | |
return | |
if data == "\r\n\r\n" or data == "\r\n" or data == " ": | |
# The markup is pretty strange so lets ignore | |
# some weird occurances. | |
pass | |
else: | |
self.pieces.append(data.replace("\r\n", " ").strip()) | |
def get_page_bytes():
    """Fetch the Time Cube front page and return the raw response body."""
    # The context manager closes the HTTP response once the body is read.
    with urlopen("http://www.timecube.com/") as response:
        return response.read()
def get_random_text(n = 3):
    """Return *n* distinct random sentences scraped from the page.

    The page is downloaded, the text inside its ``<span>`` elements is
    joined together and split on full stops; each sentence is returned
    with its trailing "." restored.

    Raises:
        ValueError: if the page yields fewer than *n* distinct sentences
            (raised by ``random.sample``).
    """
    raw = get_page_bytes()
    # Assume utf-8; bytes that cannot be decoded are dropped.
    markup = raw.decode("utf-8", "ignore")
    parser = _TimecubeParser()
    parser.feed(markup)

    joined = " ".join(parser.pieces)
    # A set removes duplicate sentences. Empty fragments (e.g. the one after
    # the final ".") are skipped so a bare "." is never offered as a quote.
    sentences = {
        fragment.strip() + "."
        for fragment in joined.split(".")
        if fragment.strip()
    }
    # random.sample() requires a sequence; sets are rejected on Python 3.11+.
    return random.sample(sorted(sentences), n)
if __name__ == '__main__':
    # Print the randomly chosen quotes, separated by a blank line.
    print(*get_random_text(), sep="\n\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment