Skip to content

Instantly share code, notes, and snippets.

@dreampuf
Created April 21, 2013 02:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dreampuf/5428287 to your computer and use it in GitHub Desktop.
Save dreampuf/5428287 to your computer and use it in GitHub Desktop.
A collection of high-frequency words in v2ex topic titles
#!/usr/bin/env python
# coding: utf-8
__author__ = "soddyque@gmail.com"
import tempfile
import cPickle as pickle
from collections import Counter
from xgoogle.search import GoogleSearch
def download_data(w):
    """Fetch every Google result page for query *w* and return all results.

    Each non-empty page returned by ``GoogleSearch.get_results()`` is pickled
    to a temporary cache file as soon as it arrives.  Once the search is
    exhausted the cache file is read back and the pages are concatenated into
    a single list.  The cache file is deliberately left on disk and its path
    is printed, so it can be inspected or reused by hand afterwards.

    Args:
        w: Query string passed to ``GoogleSearch``.

    Returns:
        A flat list with the results of every fetched page, in fetch order.
        Empty when the first page already has no results.
    """
    gs = GoogleSearch(w)
    gs.results_per_page = 100
    # NamedTemporaryFile creates *and* opens the file in one step, unlike the
    # deprecated tempfile.mktemp(), which only returns a name that another
    # process could claim before we open it.  delete=False keeps the cache.
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as f:
        tpath = f.name
        while True:
            res = gs.get_results()
            if not res:
                # An empty page marks the end of the result set.
                break
            pickle.dump(res, f)
            print(gs.page)
    print("Data cached at %s" % tpath)
    # Debugging aid: point tpath at an earlier cache file to skip the download.
    #tpath = "/var/folders/sr/m9lk94tn4x75p8nyg6dm31cr0000gn/T/tmpE4gF7b"
    rs = []
    with open(tpath, "rb") as f:
        # The file holds one pickle per page, back to back; keep loading
        # until the stream runs out.
        while True:
            try:
                rs.extend(pickle.load(f))
            except EOFError:
                break
    return rs
def take_top_n(data, n):
    """Yield the *n* most frequent words tagged ``"n"`` across result titles.

    Every item's ``title`` is segmented with ``jieba.posseg.cut`` and only
    tokens whose ``flag`` is exactly ``"n"`` are counted (presumably jieba's
    common-noun tag -- confirm against the jieba documentation).

    Args:
        data: Iterable of objects exposing a ``title`` attribute.
        n: How many of the most common words to yield.

    Yields:
        Words in descending order of frequency.
    """
    from jieba import posseg as pseg

    noun_counts = Counter()
    for item in data:
        for token in pseg.cut(item.title):
            if token.flag == "n":
                noun_counts[token.word] += 1

    for word, _count in noun_counts.most_common(n):
        yield word
if __name__ == "__main__":
    # Download the results for the v2ex query, then print the 50 most
    # frequent words found in their titles, one per line.
    results = download_data("site:v2ex.com/t 推荐")
    for word in take_top_n(results, 50):
        print(word)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment