Skip to content

Instantly share code, notes, and snippets.

Created October 9, 2011 09:58
Show Gist options
  • Save jackywyz/1273514 to your computer and use it in GitHub Desktop.
Save jackywyz/1273514 to your computer and use it in GitHub Desktop.
Analyze nginx logs with Python
# -*- coding: utf-8 -*-
__version__ = '0.2.0'
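# crontab -e schedule for mting_rank.py (reporting periods):
# 0 6 1 * *   -> 06:00 on the 1st of every month (monthly report)
# 0 4 * * 1   -> 04:00 every Monday (weekly report)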
import os
import fileinput
import re
from tornado import web,ioloop,httpclient as http
import xlwt
import xlrd
from xlutils.copy import copy
import datetime
# Short alias for xlwt's style factory; reportLog uses it to build cell styles.
ezxf = xlwt.easyxf
# NOTE(review): presumably the root directory of the nginx logs handed to
# processDir; it is not referenced anywhere in the visible code - confirm.
dir_log = r"/logs"
# URL prefix of the song lookup: formatLog appends a song ID to it and parses
# the fetched response as XML.
# NOTE(review): the value looks masked in this copy - restore the real URL.
songurl = "**********************************"
# Building blocks of the nginx access-log pattern.  Each fragment is the
# *inside* of one capturing group; nginxLogPattern supplies the parentheses.
# The final pattern is compiled with re.VERBOSE, so whitespace and the
# "#..." remarks embedded in the fragments are ignored by the regex engine.
# Each verbose fragment must end with a newline before its closing quotes,
# otherwise the ")" appended by nginxLogPattern would land inside the last
# embedded remark and be ignored.

# Client address: any run of digits and dots.
ipP = r"?P<ip>[\d.]*"

# Timestamp field including its brackets, e.g. [21/Jan/2011:15:04:41 +0800].
# Embedded remarks, line by line: "starts with [" / "any characters other
# than [ and ], so the match cannot run on into the next bracketed item (a
# non-greedy *? would also work; '*' is greedy and repeats as often as it
# can)" / "ends with ]".
timeP = r"""?P<time>\[ #以[开始
[^\[\]]* #除[]以外的任意字符 防止匹配上下个[]项目(也可以使用非贪婪匹配*?) 不在中括号里的.可以匹配换行外的任意字符 *这样地重复是"贪婪的“ 表达式引擎会试着重复尽可能多的次数。
\] #以]结束
"""

# Request field including its double quotes, e.g. "GET /a/b HTTP/1.1".
# Embedded remarks: "starts with a quote" / "any characters other than a
# double quote, so the match cannot run on into the next quoted item" /
# "ends with a quote".
requestP = r"""?P<request>\" #以"开始
[^\"]* #除双引号以外的任意字符 防止匹配上下个""项目(也可以使用非贪婪匹配*?)
\" #以"结束
"""

# Everything after the request field (at least one character).
rests = r""".+"""

# Groups, in order: ip, time, request, rest-of-line.  The literal " - - "
# between ip and time means only lines with both of those fields set to "-"
# match.
nginxLogPattern = re.compile(
    r"(%s)\ -\ -\ (%s)\ (%s)\ (%s)" % (ipP, timeP, requestP, rests),
    re.VERBOSE,
)
def getValue(value):
    """Return the text of an XML element, falling back to the string '0'.

    value is an object with a ``text`` attribute, or None (for example the
    result of ``Element.find``).  '0' is returned when value is None or
    when its text is empty or None.
    """
    if value is None:
        return '0'
    # ``or`` also covers elements that exist but carry no text.
    return value.text or '0'
def getBody(url):
    """Fetch *url* with tornado's blocking HTTP client and return the body.

    Returns the response body, or None when the fetch raises tornado's
    HTTPError; in that case the error is printed instead of propagated.
    """
    client = http.HTTPClient()
    try:
        return client.fetch(url).body
    except http.HTTPError as e:
        # Best effort: report the failure and let the caller see None.
        print("Error: %s" % e)
        return None
    finally:
        # Release the client's resources whether or not the fetch succeeded.
        client.close()
def write_xls(file_name, sheet_name, headings, data, heading_xf, data_xfs):
    """Add one sheet of tabular data to a workbook and save it as file_name.

    The workbook is loaded from ``file_name + ".swf"`` when that file can be
    opened, so sheets written by earlier runs are kept; otherwise a new
    UTF-8 workbook is started.

    file_name   -- path the workbook is saved to.
    sheet_name  -- name of the sheet to add.
    headings    -- column titles for row 0; UTF-8 byte strings are decoded.
    data        -- iterable of rows, each an iterable of cell values.
    heading_xf  -- xlwt style applied to every heading cell.
    data_xfs    -- xlwt styles for data cells, indexed by column.
    """
    try:
        book = copy(xlrd.open_workbook(file_name + ".swf", formatting_info=True))
    except IOError:
        # No earlier workbook to extend: start from scratch.
        book = xlwt.Workbook(encoding="utf-8")

    sheet = book.add_sheet(sheet_name)
    for colx, value in enumerate(headings):
        # Decode byte-string titles; text that is already unicode is used as is.
        if isinstance(value, bytes):
            value = value.decode('utf-8')
        sheet.write(0, colx, value, heading_xf)
    sheet.set_panes_frozen(True)     # frozen headings instead of split panes
    sheet.set_horz_split_pos(1)      # freeze right after the heading row
    sheet.set_remove_splits(True)    # if the user unfreezes, leave no split behind

    for rowx, row in enumerate(data, 1):
        for colx, value in enumerate(row):
            sheet.write(rowx, colx, value, data_xfs[colx])

    # NOTE(review): the visible code never saved the workbook, so nothing
    # reached disk; reportLog renames file_name afterwards, so that is
    # presumably the intended target - confirm.
    book.save(file_name)
# Scan one directory of logs: decompress its *.gz files, choose which entries
# to process, then parse every line of each chosen file with nginxLogPattern.
# NOTE(review): this copy of the function has lost its indentation and several
# tokens/lines, so it does not parse as written.  The notes below flag each gap
# instead of guessing at the missing code.
def processDir(dir_proc):
# Run "gzip -d" over every .gz file in the directory via the shell.
os.popen('gzip -d ' + dir_proc + '/*.gz')
# NOTE(review): the right-hand side is missing.  Later lines read .month, .year
# and .weekday() from it, so it is presumably today's date - confirm.
date =
# Directory entries selected for processing.
includes = []
# NOTE(review): the left operand is missing - presumably the day of the month,
# matching the "0 6 1 * *" cron line in the file header - confirm.
if == 1:
# Keep the entries whose name contains a year+month string: previous year + '12'
# when the current month is January.
# NOTE(review): the other branch passes '%Y%m' as the *day* argument of
# datetime.datetime(); a strftime('%Y%m') of the previous month looks intended - confirm.
includes = includes = filter(lambda x: (date.month ==1 and str(date.year-1) +'12' or datetime.datetime(date.year,date.month-1,'%Y%m')) in x, os.listdir(dir_proc))
# weekday() == 0 is Monday.
if date.weekday() == 0:
# b is the day before the Monday that starts date's week.
b = date - datetime.timedelta(date.weekday() + 1)
# c walks the seven days ending at b, oldest first.
# NOTE(review): c is never used in the visible code; presumably files for each
# of these days were meant to be added to includes - confirm.
for i in range(6, -1, -1):
c = b-datetime.timedelta(i)
for file in includes:
# Sub-directories are reported and then processed recursively.
if os.path.isdir(os.path.join(dir_proc, file)):
print "WARN:%s is a directory" %(file)
processDir(os.path.join(dir_proc, file))
# Entries still ending in .gz are reported as not being log files.
# NOTE(review): no "continue" is visible after either warning, so whether such
# entries are skipped cannot be told from this copy.
if file.endswith(".gz"):
print "WARN:%s is not a log file" %(file)
print "INFO:process file %s" %(file)
for line in fileinput.input(os.path.join(dir_proc, file)):
matchs = nginxLogPattern.match(line)
if matchs!=None:
# Group order follows nginxLogPattern: ip, time, request, rest of line.
allGroups = matchs.groups()
ip = allGroups[0]
time = allGroups[1]
request = allGroups[2]
# Second-to-last "/"-separated piece of the request field.
# NOTE(review): songInfo and ip are not used after this point in the visible
# code; presumably they were passed to GetResponseStatusCount - confirm.
songInfo = request.split("/")[-2]
# NOTE(review): the condition guarding this raise (an else/except line?) is not
# visible - confirm when it is meant to fire.
raise Exception
# Maps each song ID to the set of client IPs that requested it; reportLog
# ranks songs by the size of that set.
allSongDict = {}
# NOTE(review): only referenced from commented-out code in this file.
reportInfos = {}


def GetResponseStatusCount(songId, ip):
    """Record that client *ip* requested song *songId*.

    The IP is added to the song's set in allSongDict, creating the set on
    first sight of the song, so each set holds the distinct IPs seen.
    """
    # The visible original replaced an existing entry with an empty set and
    # never stored the IP, which left nothing for reportLog to count.
    allSongDict.setdefault(songId, set()).add(ip)
def formatLog(info):
    """Build the report columns for one song.

    info is indexed as ``info[0]`` (the song ID appended to songurl) and
    ``info[1]`` (a sized collection whose length becomes the count column).

    Returns ``[title, singer_names, count]``, with the artist names joined
    by ", ".  When the lookup fails, title is '0' and singer_names is ''.
    """
    import xml.etree.ElementTree as ET

    user_count = len(info[1])
    body = getBody(songurl + info[0])
    if body is None:
        # getBody has already printed the HTTP error; keep the row with
        # placeholder text rather than crashing the XML parser on None.
        return ['0', '', user_count]

    root = ET.fromstring(body)
    title = getValue(root.find("song/title"))
    # join() replaces "+=" on a never-initialised variable and drops the
    # trailing ", " that concatenation would have left behind.
    singerNames = ', '.join(
        getValue(artist.find("name"))
        for artist in root.findall("song/artists/artist")
    )
    return [title, singerNames, user_count]
def reportLog(count):
    """Write the top *count* songs of allSongDict to the Excel report(s).

    Songs are ranked by the size of their allSongDict value, largest first.
    Each row is ``[song_id, title, singers, count]``; the last three come
    from formatLog.  The sheet is named after today's date (YYYYMMDD).

    On the 1st of a month the sheet goes into ``/report/rank_<year>``, using
    the previous year when the month is January.  On a Monday it goes into
    ``/report/rank_<YYYYMM>``.  After each write the workbook is renamed to
    ``<name>.swf``, which is the file write_xls opens on the next run.
    """
    ranked = sorted(allSongDict.items(), key=lambda d: len(d[1]), reverse=True)[:count]
    reports = [[entry[0]] + formatLog(entry) for entry in ranked]

    hdngs = ['歌曲ID', '歌曲名', '歌手', '用户数']
    kinds = 'text text text int'.split()
    heading_xf = ezxf('font: bold on; align: wrap on, vert centre, horiz center')
    kind_to_xf_map = {
        'text': ezxf(),
        'int': ezxf(num_format_str='#,##0'),
    }
    data_xfs = [kind_to_xf_map[k] for k in kinds]

    # NOTE(review): the original assignment had lost its right-hand side;
    # today's date is assumed because the script is driven by cron - confirm.
    date = datetime.date.today()
    sheet_name = date.strftime("%Y%m%d")
    prefix = '/report/rank_'

    # Each period gets its own file name.  Reusing one variable made the
    # suffixes pile up when the 1st of the month fell on a Monday.
    targets = []
    # NOTE(review): "day of month == 1" is reconstructed from the
    # "0 6 1 * *" cron line in the file header - confirm.
    if date.day == 1:
        targets.append(prefix + str(date.year - 1 if date.month == 1 else date.year))
    if date.weekday() == 0:  # Monday
        targets.append(prefix + date.strftime("%Y%m"))

    for filename in targets:
        write_xls(filename, sheet_name, hdngs, reports, heading_xf, data_xfs)
        # NOTE(review): the visible code had a single shell "mv" after the
        # weekly write; renaming after every write is assumed so that both
        # workbooks can be reopened next time - confirm.
        if os.path.exists(filename):
            os.rename(filename, filename + '.swf')
if __name__ == "__main__":
    # NOTE(review): no call to processDir() or reportLog() is visible in this
    # copy, so running the script only prints the message below - confirm
    # what the entry point is meant to invoke.
    #print reportInfos
    print("done, python is great!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment