Skip to content

Instantly share code, notes, and snippets.

@KarimJedda
Forked from grantslatton/hngen.py
Last active September 27, 2021 12:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KarimJedda/42359052d4a166791b035ae15e7b4a75 to your computer and use it in GitHub Desktop.
Save KarimJedda/42359052d4a166791b035ae15e7b4a75 to your computer and use it in GitHub Desktop.
A program that uses Markov chains to generate probabilistic Hacker News titles.
# coding: utf8
import urllib.request
import re
import sys
from collections import defaultdict
from random import random
import time
"""
# Disabled scraper, kept for reference only: it rebuilds archive.txt by
# downloading every daily page of the hn-daily archive.
# No need to run this if you already have archive.txt.
# You can find a ready-made list online instead (the link is missing from
# this copy of the script),
# or, to run the scraper, take it out of this string and remove #potato.
def get_titles():
    with open("archive.txt", "w") as archive:
        #potato for year in range(17, 22):
            for month in range(1, 13):
                for day in range(1, 32):
                    url = "https://www.daemonology.net/hn-daily/20%02d-%02d-%02d.html" % (year, month, day)
                    try:
                        print(url)
                        response = urllib.request.urlopen(url)
                        html = response.read().decode('utf-8')
                        titles = re.findall(r'ylink"><[^>]*>([^<]*)', html)
                        for title in titles:
                            archive.write(title + "\n")
                        # Pause between successful requests.
                        time.sleep(1)
                    except (OSError, UnicodeDecodeError) as err:
                        # Every month is probed for 31 days, so some URLs
                        # presumably do not exist; those (and any failed
                        # download or decode) are reported and skipped.
                        # URLError/HTTPError are OSError subclasses.
                        print("woopsie", err)
get_titles()
"""
# Load the scraped headlines, one per line. The context manager closes the
# file even if the read raises. Splitting on "\n" leaves a trailing empty
# string in `titles` when the file ends with a newline.
with open("archive.txt") as archive:
    titles = archive.read().split("\n")
# markov_map[context][next_word] -> weight, where `context` is the previous
# (up to `lookback`) words joined by single spaces.
markov_map = defaultdict(lambda: defaultdict(int))
lookback = 2

# Count occurrences of each word after each context. The empty context ''
# marks the start of a title and the empty next word '' marks its end.
# Every entry of `titles` is visited: blank or too-short lines are rejected
# by the length check, so the last headline is kept even when the archive
# does not end with a newline.
for title in titles:
    words = title.split()
    if len(words) > lookback:
        # i runs one past the last index so the end-of-title marker is
        # recorded after the final context.
        for i in range(len(words) + 1):
            context = ' '.join(words[max(0, i - lookback):i])
            following_word = ' '.join(words[i:i + 1])
            markov_map[context][following_word] += 1

# Convert the counts into probabilities: each context's weights sum to 1.
for following in markov_map.values():
    total = float(sum(following.values()))
    for key in following:
        following[key] /= total
def sample(items):
    """Draw one key from weighted (key, weight) pairs in a single pass.

    Each pair with a positive weight replaces the current choice with
    probability weight / (sum of positive weights seen so far), which makes
    the final choice proportional to the weights. The weights do not need
    to be normalised.

    Args:
        items: iterable of (key, weight) pairs, e.g. ``dict.items()``.

    Returns:
        The selected key, or None when there is no pair with a positive
        weight (including empty input).
    """
    chosen = None
    running_total = 0.0
    for candidate, weight in items:
        # Zero weights can never win, and a negative weight would corrupt
        # the running total, so both are ignored.
        if weight <= 0:
            continue
        running_total += weight
        # For the first positive weight the ratio is exactly 1.0, so it is
        # always accepted (random() is < 1.0).
        if random() < weight / running_total:
            chosen = candidate
    return chosen
# Generate up to `target_count` headlines by walking the Markov map from the
# start-of-title context '' until the end-of-title marker '' is drawn.
target_count = 100
# Upper bound on walks, so a corpus too small to yield enough novel
# headlines cannot make the loop spin forever.
attempts_left = 1000 * target_count

sentences = []
while len(sentences) < target_count and attempts_left > 0:
    attempts_left -= 1
    words = []
    next_word = sample(markov_map[''].items())
    # '' ends the title; None means the context has no successors at all
    # (e.g. the archive was empty), which must not be appended or joined.
    while next_word:
        words.append(next_word)
        next_word = sample(markov_map[' '.join(words[-lookback:])].items())
    if not words:
        # The start context is empty and the map never changes, so no later
        # attempt can succeed either.
        break
    sentence = ' '.join(words)
    # Prune generated text that is a substring of an actual title.
    if not any(sentence in title for title in titles):
        sentences.append(sentence)

for sentence in sentences:
    print(sentence)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment