@joelthelion
Created October 27, 2009 10:08
This is a simple script to identify the newest trends on a set of RSS feeds. It requires Mark Pilgrim's feedparser.
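Run it with no arguments to fetch the feeds and print the most original stories. Pass --popular to list the most frequently seen words, --today to list the current words of the day, or --old to list words by how long ago they were last seen.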
#!/usr/bin/env python
# coding=utf8
# Identify the newest trends
from __future__ import division
import os
import re
import cPickle
import time
import sys
MAXIMUM_TOTAL_WEIGHT = 10000  # Maximum total of all word counts; above this, apply geometric decay
REPEAT_INTERVAL_DAYS = 30  # Minimum number of days a word must go unseen before it counts as original again
FOCUS_DAYS = 1  # Number of days a word stays interesting
my_reddits = ("programming,technology,linux,xkcd,productivity,Health,newreddits,Physics,c_language,"
              "science,business,worldnews,math,Python,startups,bioinformatics,meta,smart,shell").split(",")
def tokenize(text):
    text = re.sub(u"""[/1234567890=@\-#…«»”“’‘.!"'()*,:;<>?\[\]`{|}~&]""", " ", text).lower()
    return text.split()
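# e.g. tokenize(u"Hello, World (2009)!") == [u"hello", u"world"]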
def add(d, key):
    d[key] = d.get(key, 0) + 1
def add_tuple(d, key, default_value, index=0):
    values = list(d.get(key, default_value))
    values[index] += 1
    d[key] = tuple(values)
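# e.g. d = {}; add_tuple(d, u"word", default_value=(0, now)) leaves d[u"word"] == (1, now)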
def get_feed_stories(feeds=[
        "http://digg.com/rss/index.xml", "http://reddit.com/r/all/.rss",
        "http://www.lemonde.fr/rss/sequence/0,2-3208,1-0,0.xml",
        "http://linuxfr.org/backend/news-homepage/rss20.rss", "http://del.icio.us/rss/",
        "http://www.lefigaro.fr/rss/figaro_actualites.xml", "http://news.ycombinator.com/rss",
        "http://linuxfr.org/backend/journaux/rss20.rss",
        "http://www.lepoint.fr/content/system/rss/a_la_une/a_la_une_doc.xml",
        "http://rss.feedsportal.com/c/568/f/7295/index.rss",
        "http://www.marianne2.fr/xml/syndication.rss",
        "http://syndication.lesechos.fr/rss/rss_une.xml",
        "http://blogs.lexpress.fr/attali/index.xml",
        "http://feeds.feedburner.com/consommateur-si-tu-savais",
        "http://www.reddit.com/r/AskReddit/",
        "http://tempsreel.nouvelobs.com/file/rss_perm/rss_permanent.xml",
        "http://top25.sciencedirect.com/rss.php?subject_area_id=17&journal_id=13618415",
        "http://rss.sciencedirect.com/getMessage?registrationId=IHHEIIHEJNHFQHIGKHHLIMJFJLKKLLKJNZJMLPOLMN",
        "http://feedproxy.google.com/Phoronix",
        "http://rss.feedsportal.com/c/499/f/413823/index.rss",
        "http://www.slate.fr/rss.xml"]):
    # disabled: "http://www.liberation.fr/interactif/rss/actualites/",
    import feedparser
    stories = []
    feeds = feeds + ["http://reddit.com/r/%s/.rss" % r for r in my_reddits]  # copy, so the default list isn't mutated on every call
    for f in feeds:
        print "Fetching %s..." % f
        try:
            stories.extend((entry.title, f) for entry in feedparser.parse(f).entries)
        except Exception:
            print "Error parsing %s..." % f
    return stories
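# Each returned story is a (title, feed_url) tuple,
# e.g. (u"A hypothetical headline", "http://news.ycombinator.com/rss").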
def get_object_from_file(filename, default={}):
    """Load a pickled object from filename, or return default if the file doesn't exist."""
    try:
        f = open(filename)
        obj = cPickle.load(f)
        f.close()
    except IOError:
        print "%s file not found, creating a new one..." % filename
        obj = default
    return obj
def downsize_counts(already_seen):
    """Keep word counts reasonable using geometric decay, so that new trends don't go unnoticed"""
    total = sum(count for last_day, count in already_seen.values())
    if total > MAXIMUM_TOTAL_WEIGHT * 1.01:  # *1.01 so we don't do it every time
        print "Total count too big (%d), downsizing counts..." % total
        for k, (last_day, old_count) in already_seen.items():
            already_seen[k] = last_day, old_count / (total / MAXIMUM_TOTAL_WEIGHT)
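# Worked example: with MAXIMUM_TOTAL_WEIGHT=10000 and a total of 20200, every count
# is divided by 2.02, so a word counted 101 times drops to 50 while the relative
# order of all counts is preserved.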
def show_original_stuff():
    now = time.time()
    import datetime
    today = datetime.date.fromtimestamp(now).toordinal()
    already_seen, distinct_use_days, last_use_day, todays_words = get_object_from_file(
        os.path.expanduser("~/.popurls_alreadyseen.pck"), ({}, 0, 0, {}))
    downsize_counts(already_seen)
    already_seen_links = get_object_from_file(os.path.expanduser("~/.popurls_alreadyseen_links.pck"), set())
    time_fetched, story_ratings = get_object_from_file(os.path.expanduser("~/.popurls.pck"), (0, None))
    if story_ratings is None or now - time_fetched > 10 * 60:  # refetch if the cache is older than ten minutes
        common_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "common.txt")
        common = set(unicode(open(common_path).read()[:-1], "utf8").split(","))
        all_stories = get_feed_stories()
        stories = []
        for s, feed in all_stories:
            if s not in already_seen_links:
                already_seen_links.add(s)
                stories.append((s, feed))
        raw_text = " ".join(s for s, f in stories)
        if not stories:
            print "No new stories found"
            sys.exit()
        for word, (count, time_added) in todays_words.items():
            if now - time_added > 86400 * FOCUS_DAYS:
                del todays_words[word]
        if today > last_use_day:
            distinct_use_days += 1
        time_fetched = now
        for word in tokenize(raw_text):
            if word not in common and len(word) >= 2:  # common words don't interest us
                if word in already_seen:
                    if word in todays_words:
                        add_tuple(todays_words, word, default_value=(0, now), index=0)
                    already_seen[word] = distinct_use_days, already_seen[word][1] + 1  # this word is still being seen
                else:
                    already_seen[word] = distinct_use_days, 1
                    add_tuple(todays_words, word, default_value=(0, now), index=0)
        # Rate each story: 100 * (sum of today's counts for its words) / (number of words in the title)
        story_ratings = [(story,
                          int(100 * sum(todays_words[w][0] for w in story_words if w in todays_words)
                              / len(story_words)),
                          feed)
                         for story, story_words, feed in ((s, tokenize(s), feed) for s, feed in stories)
                         if len(story_words) > 0]
        story_ratings.sort(key=lambda e: e[1])
        for k, (t, dummy) in already_seen.items():  # if a keyword hasn't been seen in a month, it's interesting again
            if distinct_use_days - t > REPEAT_INTERVAL_DAYS:
                print ("Cleanup: removed %s from seen dictionary" % k).encode('utf-8')
                del already_seen[k]
    print "Eliminated %d unoriginal stories" % sum(1 for s, rating, feed in story_ratings if rating == 0)
    print "The most original stories are:"
    for s, rating, feed in story_ratings:
        if rating > 0:
            print ("(%d) %s ( %s )" % (rating, s, feed)).encode('utf-8')
    f = open(os.path.expanduser("~/.popurls_alreadyseen.pck"), "wb")
    cPickle.dump((already_seen, distinct_use_days, today, todays_words), f, -1)
    f.close()
    f = open(os.path.expanduser("~/.popurls.pck"), "wb")
    cPickle.dump((time_fetched, story_ratings), f, -1)
    f.close()
    f = open(os.path.expanduser("~/.popurls_alreadyseen_links.pck"), "wb")
    cPickle.dump(already_seen_links, f, -1)
    f.close()
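# State kept between runs, as pickles in $HOME:
#   ~/.popurls_alreadyseen.pck       -> (already_seen, distinct_use_days, last_use_day, todays_words)
#   ~/.popurls.pck                   -> (time_fetched, story_ratings)
#   ~/.popurls_alreadyseen_links.pck -> set of story titles already processed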
def show_popular_words():
    common = set(unicode(open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "common.txt")).read()[:-1], "utf8").split(","))
    a, dummy1, dummy2, dummy3 = cPickle.load(open(os.path.expanduser("~/.popurls_alreadyseen.pck")))
    a = a.items()
    a.sort(key=lambda e: e[1][1])  # sort by total count, most popular last
    for k in a:
        if k[0] not in common and k[1][1] > 1:
            print ("%s (%.1f)" % (k[0], k[1][1])).encode('utf-8')
    print "There are %d words in the popular database" % len(a)
def show_oldest_words():
    already_seen, distinct_use_days, last_use_day, todays_words = get_object_from_file(
        os.path.expanduser("~/.popurls_alreadyseen.pck"), ({}, 0, 0, {}))
    a = already_seen.items()
    a.sort(key=lambda e: -e[1][0])  # most recently seen first
    n = 0
    for k, (t, times_seen) in a:
        if times_seen >= 0.02:  # filter out old typos
            print ("%s (%d days)" % (k, distinct_use_days - t)).encode('utf-8')
            n += 1
    print "Showed %d words with some importance, sorted by time since last seen in the news" % n
def show_todays_words():
    already_seen, distinct_use_days, last_use_day, todays_words = get_object_from_file(
        os.path.expanduser("~/.popurls_alreadyseen.pck"), ({}, 0, 0, {}))
    cur = todays_words.items()
    cur.sort(key=lambda e: e[1])  # sort by (count, time_added)
    if cur:
        print "Words of the day:"
        for word, (count, time_added) in cur:
            if count > 0:
                print ("%-20s(%d) (%.2f days)" % (word, count, (time.time() - time_added) / 86400)).encode('utf-8')
    else:
        print "No new words today :-("
if __name__ == '__main__':
    import getopt
    optlist, args = getopt.getopt(sys.argv[1:], '', ['popular', 'today', 'old'])
    for o, a in optlist:
        if o == "--popular":
            show_popular_words()
            sys.exit()
        if o == "--old":
            show_oldest_words()
            sys.exit()
        if o == "--today":
            show_todays_words()
            sys.exit()
    show_original_stuff()
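A companion shell wrapper runs the script and highlights today's buzzwords in its output; it assumes the script above is saved as buzz.py somewhere on the PATH: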
#!/bin/bash
buzz.py > ${HOME}/tmp/buzz.txt
cat ${HOME}/tmp/buzz.txt | grep --color=always -Ei -e "($(buzz.py --today | cut -f1 -d' ' | sed 's/$/[^A-Za-z]/;s/^/[^A-Za-z]/' | tr '\n' '|' | sed 's/.$//'))"
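For example, if --today printed the words haskell and kernel, the pipeline would build the pattern ([^A-Za-z]haskell[^A-Za-z]|[^A-Za-z]kernel[^A-Za-z]), so grep highlights, case-insensitively, lines where a buzzword appears surrounded by non-letters.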