dreampuf/V2exRelaTopNWord.py

## V2exRelaTopNWord.py
#!/usr/bin/env python
# coding: utf-8

__author__ = "soddyque@gmail.com"

import tempfile
import cPickle as pickle
from collections import Counter

from xgoogle.search import GoogleSearch


def download_data(w):
    """Collect every Google result batch for query *w* and return them flat.

    Batches are requested 100 results per page until ``get_results()``
    returns an empty/falsy batch.  Each batch is pickled to a temporary
    cache file as it arrives; the file is then read back to build the
    returned list.  The page counter is printed after every batch and the
    cache path is printed once downloading finishes.

    The cache file is left on disk on purpose (its path is reported) --
    presumably so an earlier run can be replayed by pointing ``tpath`` at it.

    :param w: query string handed to ``GoogleSearch``.
    :return: list with the items of all batches, in download order.
    """
    gs = GoogleSearch(w)
    gs.results_per_page = 100

    # NamedTemporaryFile creates the file itself, atomically.  The previous
    # tempfile.mktemp() only produced a *name*, leaving a window in which
    # another process could create that path first.  delete=False keeps the
    # cache around after the handle is closed.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
        tpath = f.name
        while True:
            res = gs.get_results()
            if not res:
                break
            pickle.dump(res, f)
            print(gs.page)

    print("Data cached at %s" % tpath)

    # The cache holds one pickle per batch, back to back; keep loading
    # until the stream runs out.
    rs = []
    with open(tpath, "rb") as f:
        while True:
            try:
                rs.extend(pickle.load(f))
            except EOFError:
                break
    return rs

def take_top_n(data, n):
    """Yield the *n* most frequent "n"-flagged words found in result titles.

    Every item in *data* must expose a ``title`` attribute.  Each title is
    segmented with ``jieba.posseg.cut`` and only tokens whose ``flag`` equals
    ``"n"`` are counted (presumably jieba's noun tag).  Words are yielded in
    ``Counter.most_common`` order, without their counts.

    This is a generator, so jieba is imported and the titles are processed
    only once iteration starts.
    """
    from jieba import posseg as pseg

    noun_counts = Counter()
    for item in data:
        for token in pseg.cut(item.title):
            if token.flag == "n":
                noun_counts[token.word] += 1

    for word, _count in noun_counts.most_common(n):
        yield word


if __name__ == "__main__":
    # Script entry point: download the results for the v2ex topic query and
    # print the 50 most frequent title words, one per line.
    results = download_data("site:v2ex.com/t 推荐")
    for word in take_top_n(results, 50):
        print(word)
	#!/usr/bin/env python
	# coding: utf-8

	__author__ = "soddyque@gmail.com"

	import tempfile
	import cPickle as pickle
	from collections import Counter

	from xgoogle.search import GoogleSearch


	def download_data(w):
	    """Collect every Google result batch for query *w* and return them flat.

	    Batches are requested 100 results per page until ``get_results()``
	    returns an empty/falsy batch.  Each batch is pickled to a temporary
	    cache file as it arrives; the file is then read back to build the
	    returned list.  The page counter is printed after every batch and the
	    cache path is printed once downloading finishes.

	    :param w: query string handed to ``GoogleSearch``.
	    :return: list with the items of all batches, in download order.
	    """
	    gs = GoogleSearch(w)
	    gs.results_per_page = 100

	    # NamedTemporaryFile creates the file itself, atomically, unlike
	    # tempfile.mktemp() which only returns a name another process could
	    # claim first.  delete=False keeps the cache after the handle closes.
	    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
	        tpath = f.name
	        while True:
	            res = gs.get_results()
	            if not res:
	                break
	            pickle.dump(res, f)
	            print(gs.page)

	    print("Data cached at %s" % tpath)

	    # The cache holds one pickle per batch, back to back; keep loading
	    # until the stream runs out.
	    rs = []
	    with open(tpath, "rb") as f:
	        while True:
	            try:
	                rs.extend(pickle.load(f))
	            except EOFError:
	                break
	    return rs

	def take_top_n(data, n):
	    """Yield the *n* most frequent "n"-flagged words found in result titles.

	    Every item in *data* must expose a ``title`` attribute.  Titles are
	    segmented with ``jieba.posseg.cut``; only tokens whose ``flag`` equals
	    ``"n"`` are counted (presumably jieba's noun tag).  Words are yielded
	    in ``Counter.most_common`` order, without their counts.
	    """
	    from jieba import posseg as pseg
	    cter = Counter([w.word for i in data
	                    for w in pseg.cut(i.title) if w.flag == "n"])
	    for k, _freq in cter.most_common(n):
	        yield k



	if __name__ == "__main__":
	    # Script entry point: download the results for the v2ex topic query
	    # and print the 50 most frequent title words, one per line.
	    data = download_data("site:v2ex.com/t 推荐")
	    for i in take_top_n(data, 50):
	        print(i)