Skip to content

Instantly share code, notes, and snippets.

@motiejus
Last active September 30, 2015 20:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save motiejus/1861398 to your computer and use it in GitHub Desktop.
Save motiejus/1861398 to your computer and use it in GitHub Desktop.
Most popular discussions in Erlang-questions
*.txt.gz
all.txt
filelist.mk
# Remove a target whose recipe failed, so a truncated download or a partial
# all.txt is never mistaken for an up-to-date file on the next run.
.DELETE_ON_ERROR:

# filelist.mk defines $(gzs), the list of monthly archives to download.  Make
# regenerates a missing included makefile before doing anything else, so skip
# the include when only cleanup goals were requested: otherwise `make clean`
# would fetch the index over the network just to delete it again.
goals := $(if $(MAKECMDGOALS),$(MAKECMDGOALS),all)
ifneq (,$(filter-out clean distclean,$(goals)))
-include filelist.mk
endif

.PHONY: all clean distclean
# Default goal: build the combined mailbox, then run the subject statistics
# script on it.
all: all.txt
	./subjects.py all.txt
# Concatenate every monthly archive listed in $(gzs) into one mailbox file.
#
# An empty $(gzs) is rejected up front: with no file arguments gzip would try
# to decompress standard input instead.  The output is assembled in a
# temporary file and renamed into place, so a failed gzip never leaves a
# truncated all.txt that looks up to date.
all.txt: filelist.mk $(gzs)
	@test -n "$(strip $(gzs))" || \
	  { echo "all.txt: filelist.mk lists no archives" >&2; exit 1; }
	gzip -cd $(gzs) > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Generate the list of monthly archives by scraping the pipermail index page.
# The result defines $(gzs): an empty assignment followed by one
# `gzs += <file>` line per archive link found in the index.  Plain `+=` lines
# avoid backslash continuations, whose generation via `echo` depends on how the
# shell's echo treats backslashes.
#
# The index is captured into a shell variable first so that a curl failure
# aborts the recipe (in a plain `curl | perl` pipeline only perl's exit status
# counts).  The list is written to a temporary file and renamed into place so
# a failed run never leaves a half-written filelist.mk behind.
filelist.mk:
	index=$$(curl -fsSL http://erlang.org/pipermail/erlang-questions/) && \
	{ echo 'gzs :='; \
	  printf '%s\n' "$$index" | \
	  perl -ne 's/.*"(\d{4}-\w+\.txt\.gz)".*/gzs += $$1/ && print'; \
	} > $@.tmp || { rm -f $@.tmp; exit 1; }
	mv -f $@.tmp $@
# Download one monthly archive.  wget writes to a temporary name that is only
# renamed to the real target after it exits successfully, so an aborted or
# timed-out download is retried on the next run instead of leaving a truncated
# .txt.gz in place.
%.txt.gz:
	wget -q --timeout=5 -O $@.part http://erlang.org/pipermail/erlang-questions/$@ \
	  || { rm -f $@.part; exit 1; }
	mv -f $@.part $@
# Remove the generated archive list and the combined mailbox; downloaded
# archives are kept.
clean:
	rm -f all.txt filelist.mk
# Remove everything this Makefile produces, including every downloaded
# monthly archive.
distclean:
	rm -f *.txt.gz all.txt filelist.mk
#!/usr/bin/python
"""
Prints the 25 most popular subjects from the erlang-questions mailing list.
First, the data must be fetched:
curl http://erlang.org/pipermail/erlang-questions/ |\
perl -ne 's/.*"(\d\d\d\d-\w+.txt.gz)".*/\1/ && print $_' |\
xargs -I{} curl http://erlang.org/pipermail/erlang-questions/{} |\
gzip -d >> all.txt
Then start the following script with the output:
./subjects.py all.txt
"""
from mailbox import mbox
from email.utils import mktime_tz, parsedate_tz
from time import strftime, gmtime
def get_data(f):
    """Collect per-subject statistics from the mbox file at path ``f``.

    Every message with a non-empty Subject header is grouped by its subject
    (whitespace-stripped, with the "[erlang-questions] " tag removed).

    Returns the values of a dict keyed by subject; each value is a dict with:
      'subj' -- the normalised subject text
      'occ'  -- number of messages carrying that subject
      'min'  -- smallest message timestamp seen (seconds since the epoch)
      'max'  -- largest message timestamp seen (seconds since the epoch)

    A message whose Date header is missing or cannot be parsed inherits the
    timestamp of the previous counted message (0 if there is none yet).

    Raises mailbox.NoSuchMailboxError if ``f`` does not exist.
    """
    stats = {}
    prev_t = 0
    # create=False: a mistyped path must fail loudly rather than silently
    # create an empty mailbox file and report nothing.
    box = mbox(f, create=False)
    try:
        for msg in box:
            raw_subj = msg.get('Subject')
            # Test for None before str(): str(None) is the truthy string
            # 'None', which would count subject-less mails as a real thread.
            if raw_subj is None:
                continue
            subj = str(raw_subj).strip()
            if not subj:
                continue
            subj = subj.replace("[erlang-questions] ", "")

            raw_date = msg.get('Date')
            # parsedate_tz() returns None for garbage input; fall back to the
            # previous timestamp instead of crashing in mktime_tz().
            parsed = parsedate_tz(str(raw_date)) if raw_date else None
            try:
                t = mktime_tz(parsed) if parsed else prev_t
            except (TypeError, ValueError, OverflowError):
                t = prev_t
            prev_t = t

            mail = stats.setdefault(
                subj, {'subj': subj, 'min': t, 'max': t, 'occ': 0})
            mail['occ'] += 1
            mail['min'] = min(mail['min'], t)
            mail['max'] = max(mail['max'], t)
    finally:
        # Release the file handle the mailbox object keeps open.
        box.close()
    return stats.values()
import sys
if __name__ == '__main__':
    # Entry point: print the 25 most frequent subjects of the mbox file named
    # on the command line, one per line, as
    #   rank | occurrences | first date - last date | subject
    if len(sys.argv) < 2:
        sys.exit('usage: %s MBOX_FILE' % sys.argv[0])
    values = get_data(sys.argv[1])
    top = sorted(values, key=lambda k: k['occ'], reverse=True)

    def d(f):
        # Spelled-out equivalent of '%F', which is a C-library extension that
        # not every platform's strftime supports.
        return strftime('%Y-%m-%d', gmtime(f))

    for n, i in enumerate(top[:25], 1):
        print('%3d | %3d | %s - %s | %s' %
              (n, i['occ'], d(i['min']), d(i['max']), i['subj']))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment