Skip to content

Instantly share code, notes, and snippets.

@strogonoff
Created March 12, 2014 07:02
Show Gist options
  • Save strogonoff/9502160 to your computer and use it in GitHub Desktop.
Search your saved HN items
#coding: utf-8
u"""
Searches your saved HN items. Many false positives (overkill with synonyms).
..
$ pip install mechanize nltk
$ python hnsavedsearch.py username "space separated query"
"""
# This is a script, not a library: refuse to be imported as a module.
if __name__ != '__main__':
    raise ImportError("hnsavedsearch isn't supposed to be imported")

# Command-line interface: a positional username and one quoted query string.
import argparse

parser = argparse.ArgumentParser(
    description="Search your HN saved stories by title text.")
parser.add_argument('username', type=str)
parser.add_argument('query', type=str)
args = parser.parse_args()
# Mechanize setup: build a browser that keeps cookies (needed for the HN
# session), follows redirects/refreshes, and presents a regular Firefox
# User-Agent so the site serves normal pages.
import mechanize
import cookielib

br = mechanize.Browser()
cookie_jar = cookielib.LWPCookieJar()
br.set_cookiejar(cookie_jar)

# Behave like an ordinary interactive browser.
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
# ... except robots.txt, which would block the crawl.
br.set_handle_robots(False)

# Uncomment for verbose wire-level debugging:
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# NLTK check
try:
from nltk.corpus import wordnet
wordnet.synsets('cake')
except LookupError:
print "wordnet corpus appears to be not installed, initiating download. Download to home directory!"
import nltk
result = nltk.download()
if result == True:
print "Installation hopefully successful"
from ntlk.corpus import wordnet
# Log in: open the HN login page, fill the first form with the given
# username and an interactively prompted password, and submit it.  The
# session cookie lands in the browser's cookie jar.
import getpass

br.open('https://news.ycombinator.com/newslogin?whence=news')
br.select_form(nr=0)
password = getpass.getpass("Pass for %s: " % args.username)
br.form['u'] = args.username
br.form['p'] = password
br.submit()
# Prepare search
def lemmas(words, synonyms=False):
    u"""Return the set of ``words`` plus their lemmas.

    With ``synonyms=True``, expand each word through every lemma of every
    WordNet synset it belongs to — deliberately broad, and the source of
    the "many false positives" mentioned in the module docstring.
    Otherwise just lemmatize each word (plural -> singular etc.).
    """
    # BUG FIX: the original did "from nltk import wordnet as wn" and then
    # called wn.WordNetLemmatizer() — the lemmatizer actually lives in
    # nltk.stem, and the synonyms branch silently fell back on the
    # module-global `wordnet` import rather than `wn`.
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer
    if synonyms:
        found = set(lemma
                    for word in words
                    for synset in wordnet.synsets(word)
                    for lemma in synset.lemma_names)
    else:
        lemmatize = WordNetLemmatizer().lemmatize
        found = set(lemmatize(word) for word in words)
    # Always keep the literal query words as well.
    return found.union(set(words))
query = lemmas(args.query.split())
print "Original query: %s" % args.query
print " expanded: %s" % ', '.join(w for w in query)
# Search
def iterate_links(url):
global _page
global _links_processed
global _matches_found
_match = None
br.open(url)
for link in br.links():
# Internal links
if 'news.ycombinator.com' in link.absolute_url:
if link.url.startswith('item?id=') and _match is not None:
print "{:<30} \"{}\" on page {}".format(
link.absolute_url, _match, _page)
_match = None
continue
elif link.text == "More":
_page += 1
iterate_links(link.absolute_url)
break
# External link
if query.intersection(lemmas(link.text.split(), True)):
_matches_found += 1
_match = link.text
_links_processed += 1
try:
_page = 1
_matches_found = 0
_links_processed = 0
iterate_links('https://news.ycombinator.com/saved?id=%s' % args.username)
except KeyboardInterrupt:
print "\n"
print "Interrupted on page {}".format(_page)
print "Links processed: {}".format(_links_processed)
print "Matches found: {}".format(_matches_found)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment