KarimJedda/hngen.py

## hngen.py
# coding: utf8
import urllib.request
import re
import sys
from collections import defaultdict
from random import random
import time

"""
# No need to run this
# You can find a list here:
#
# or remove the "#potato" marker below (and the enclosing triple quotes) to re-run the scraper

def get_titles():
    with open("archive.txt","w") as archive:
        #potato for year in range(17,22):
            for month in range(1,13):
                for day in range(1,32):
                    try:
                        print("https://www.daemonology.net/hn-daily/20%02d-%02d-%02d.html" % (year, month, day))
                        response = urllib.request.urlopen("https://www.daemonology.net/hn-daily/20%02d-%02d-%02d.html" % (year, month, day))
                        html = response.read().decode('utf-8')
                        titles = re.findall(r'ylink"><[^>]*>([^<]*)', str(html))
                        for title in titles:
                            archive.write(title+"\n")
                        time.sleep(1)
                    except:
                        print("woopsie")

get_titles()
"""

# Load the scraped titles, one per line. The context manager guarantees the
# file handle is released even if reading fails part-way through.
# Splitting on "\n" leaves a final empty entry when the file ends with a newline.
with open("archive.txt") as archive:
    titles = archive.read().split("\n")
# Number of preceding words that make up the state (context) of the chain.
lookback = 2

# Nested counter: context string -> next word -> times that word followed it.
markov_map = defaultdict(lambda: defaultdict(int))

# Count transitions for every title except the last entry of `titles`.
# The empty context '' stands for "start of title" and the empty next word ''
# stands for "end of title". Titles with `lookback` words or fewer are skipped.
for raw_title in titles[:-1]:
    words = raw_title.split()
    if len(words) <= lookback:
        continue
    for pos in range(len(words) + 1):
        context = ' '.join(words[max(0, pos - lookback):pos])
        following_word = ' '.join(words[pos:pos + 1])
        markov_map[context][following_word] += 1

# Normalise in place: for each context, turn the raw successor counts into
# probabilities by dividing every count by the context's total.
for successors in markov_map.values():
    count_sum = float(sum(successors.values()))
    for candidate, count in successors.items():
        successors[candidate] = count / count_sum

#Typical sampling from a categorical distribution
def sample(items):
    """Draw one key from an iterable of (key, weight) pairs in a single pass.

    A running total of the weights is kept; each pair replaces the current
    pick with probability weight / running_total. Returns None when `items`
    is empty or when the running total never becomes non-zero.
    """
    chosen = None
    running_total = 0.0
    for candidate, weight in items:
        running_total += weight
        if not running_total:
            # Nothing has any weight yet; skip without consuming a random draw.
            continue
        if random() < weight / running_total:
            chosen = candidate
    return chosen

# Generate and print novel titles by walking the chain from the start
# context '' until the end-of-title marker '' is drawn.
target_count = 100
# Cap on generation attempts so a corpus that cannot yield `target_count`
# novel sentences ends the run instead of looping forever.
max_attempts = 100 * target_count

# With no start transitions, sample() would return None and the join below
# would raise TypeError; stop with a readable message instead.
if not markov_map['']:
    sys.exit("archive.txt contains no title longer than %d words; nothing to generate" % lookback)

sentences = []
attempts = 0
while len(sentences) < target_count and attempts < max_attempts:
    attempts += 1
    sentence = []
    next_word = sample(markov_map[''].items())
    # '' marks the end of a title; None (context without successors) also ends the walk.
    while next_word:
        sentence.append(next_word)
        next_word = sample(markov_map[' '.join(sentence[-lookback:])].items())
    sentence = ' '.join(sentence)
    # Prune sentences that are substrings of actual titles
    if not any(sentence in title for title in titles):
        sentences.append(sentence)

if len(sentences) < target_count:
    print("only %d novel titles found after %d attempts" % (len(sentences), attempts), file=sys.stderr)

for sentence in sentences:
    print(sentence)
	# coding: utf8
	import urllib.request
	import re
	import sys
	from collections import defaultdict
	from random import random
	import time

	"""
	# No need to run this
	# You can find a list here:
	#
	# or remove #potato

	def get_titles():
	with open("archive.txt","w") as archive:
	#potato for year in range(17,22):
	for month in range(1,13):
	for day in range(1,32):
	try:
	print("https://www.daemonology.net/hn-daily/20%02d-%02d-%02d.html" % (year, month, day))
	response = urllib.request.urlopen("https://www.daemonology.net/hn-daily/20%02d-%02d-%02d.html" % (year, month, day))
	html = response.read().decode('utf-8')
	titles = re.findall(r'ylink"><[^>]*>([^<]*)', str(html))
	for title in titles:
	archive.write(title+"\n")
	time.sleep(1)
	except:
	print("woopsie")

	get_titles()
	"""

	# Load the scraped titles, one per line; the context manager closes the
	# file even if reading fails.
	with open("archive.txt") as archive:
		titles = archive.read().split("\n")
	markov_map = defaultdict(lambda:defaultdict(int))

	# Number of preceding words used as the context for the next word.
	lookback = 2

	#Generate map in the form context -> next word -> occurrences of next word after context
	#('' as context marks the start of a title, '' as next word marks its end)
	for title in titles[:-1]:
		title = title.split()
		if len(title) > lookback:
			for i in range(len(title)+1):
				markov_map[' '.join(title[max(0,i-lookback):i])][' '.join(title[i:i+1])] += 1

	#Convert map to the form context -> next word -> probability of next word after context
	for word, following in markov_map.items():
		total = float(sum(following.values()))
		for key in following:
			following[key] /= total

	#Typical sampling from a categorical distribution
	def sample(items):
		"""Draw one key from an iterable of (key, weight) pairs in a single pass.

		Each pair replaces the current pick with probability
		weight / running total of weights seen so far. Returns None when
		`items` is empty or the running total never becomes non-zero.
		"""
		next_word = None
		t = 0.0
		for k, v in items:
			t += v
			if t and random() < v/t:
				next_word = k
		return next_word

	# Generate and print novel titles by walking the chain from the start
	# context '' until the end-of-title marker '' is drawn.
	target_count = 100
	# Cap on generation attempts so a corpus that cannot yield `target_count`
	# novel sentences ends the run instead of looping forever.
	max_attempts = 100 * target_count

	# With no start transitions, sample() would return None and the join below
	# would raise TypeError; stop with a readable message instead.
	if not markov_map['']:
		sys.exit("archive.txt contains no title longer than %d words; nothing to generate" % lookback)

	sentences = []
	attempts = 0
	while len(sentences) < target_count and attempts < max_attempts:
		attempts += 1
		sentence = []
		next_word = sample(markov_map[''].items())
		# '' marks the end of a title; None (context without successors) also ends the walk.
		while next_word:
			sentence.append(next_word)
			next_word = sample(markov_map[' '.join(sentence[-lookback:])].items())
		sentence = ' '.join(sentence)
		# Prune sentences that are substrings of actual titles
		if not any(sentence in title for title in titles):
			sentences.append(sentence)

	if len(sentences) < target_count:
		print("only %d novel titles found after %d attempts" % (len(sentences), attempts), file=sys.stderr)

	for sentence in sentences:
		print(sentence)