Skip to content

Instantly share code, notes, and snippets.

@strogonoff
Created March 12, 2014 07:02
Show Gist options
  • Save strogonoff/9502160 to your computer and use it in GitHub Desktop.
Search your saved HN items
#coding: utf-8
u"""
Searches your saved HN items. Many false positives (overkill with synonyms).
..
$ pip install mechanize nltk
$ python hnsavedsearch.py username "space separated query"
"""
# This is a script, not a library: refuse to be imported as a module.
if __name__ != '__main__':
    raise ImportError("hnsavedsearch isn't supposed to be imported")

# Command-line interface: a positional username and one quoted query string.
import argparse

parser = argparse.ArgumentParser(
    description="Search your HN saved stories by title text.")
parser.add_argument('username', type=str)
parser.add_argument('query', type=str)
args = parser.parse_args()
# Mechanize setup: build a browser that keeps cookies (needed for the HN
# session), follows redirects/refreshes, and presents a regular Firefox
# User-Agent so the site serves normal pages.
import mechanize
import cookielib

br = mechanize.Browser()
cookie_jar = cookielib.LWPCookieJar()
br.set_cookiejar(cookie_jar)

# Behave like an ordinary interactive browser.
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# ... except robots.txt, which would block the crawl.
br.set_handle_robots(False)

# Uncomment for verbose wire-level debugging:
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# NLTK check
try:
from nltk.corpus import wordnet
wordnet.synsets('cake')
except LookupError:
print "wordnet corpus appears to be not installed, initiating download. Download to home directory!"
import nltk
result = nltk.download()
if result == True:
print "Installation hopefully successful"
from ntlk.corpus import wordnet
# Log in: open the HN login page, fill the first form with the given
# username and an interactively prompted password, and submit it.  The
# session cookie lands in the browser's cookie jar.
import getpass

br.open('https://news.ycombinator.com/newslogin?whence=news')
br.select_form(nr=0)
password = getpass.getpass("Pass for %s: " % args.username)
br.form['u'] = args.username
br.form['p'] = password
br.submit()
# Prepare search
def lemmas(words, synonyms=False):
    u"""Return the set of ``words`` plus their lemmas.

    With ``synonyms=True``, expand each word through every lemma of every
    WordNet synset it belongs to — deliberately broad, and the source of
    the "many false positives" mentioned in the module docstring.
    Otherwise just lemmatize each word (plural -> singular etc.).
    """
    # BUG FIX: the original did "from nltk import wordnet as wn" and then
    # called wn.WordNetLemmatizer() — the lemmatizer actually lives in
    # nltk.stem, and the synonyms branch silently fell back on the
    # module-global `wordnet` import rather than `wn`.
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer
    if synonyms:
        found = set(lemma
                    for word in words
                    for synset in wordnet.synsets(word)
                    for lemma in synset.lemma_names)
    else:
        lemmatize = WordNetLemmatizer().lemmatize
        found = set(lemmatize(word) for word in words)
    # Always keep the literal query words as well.
    return found.union(set(words))
query = lemmas(args.query.split())
print "Original query: %s" % args.query
print " expanded: %s" % ', '.join(w for w in query)
# Search
def iterate_links(url):
global _page
global _links_processed
global _matches_found
_match = None
br.open(url)
for link in br.links():
# Internal links
if 'news.ycombinator.com' in link.absolute_url:
if link.url.startswith('item?id=') and _match is not None:
print "{:<30} \"{}\" on page {}".format(
link.absolute_url, _match, _page)
_match = None
continue
elif link.text == "More":
_page += 1
iterate_links(link.absolute_url)
break
# External link
if query.intersection(lemmas(link.text.split(), True)):
_matches_found += 1
_match = link.text
_links_processed += 1
try:
_page = 1
_matches_found = 0
_links_processed = 0
iterate_links('https://news.ycombinator.com/saved?id=%s' % args.username)
except KeyboardInterrupt:
print "\n"
print "Interrupted on page {}".format(_page)
print "Links processed: {}".format(_links_processed)
print "Matches found: {}".format(_matches_found)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment