Skip to content

Instantly share code, notes, and snippets.

@vls
Created February 21, 2012 03:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vls/1873402 to your computer and use it in GitHub Desktop.
Save vls/1873402 to your computer and use it in GitHub Desktop.
test script for re and re2
#!/usr/bin/env python
#encoding=utf-8
import time
import os, sys
import re
import pickle
import re2
# Set re2's fallback notification level to FALLBACK_WARNING.
# NOTE(review): presumably this makes the re2 binding emit a warning whenever
# it cannot handle a pattern itself and hands it to the stdlib `re` engine --
# confirm against the installed re2 binding's documentation.
re2.set_fallback_notification(re2.FALLBACK_WARNING)
# Engine switches. The defaults select the stdlib `re` module working on
# unicode text; main() flips both of them when re2 is requested.
FLAG_RE2 = False
FORCE_UNICODE = True

# Category key -> list that load() fills with (compiled_regex, pattern_text).
REGEX_DICT = dict(
    (category, [])
    for category in (
        'fengting1',
        'fengting2',
        'geli',
        'jinyan',
        'nick',
        'warning',
        'exc',
    )
)

# Active rule files, one entry per category:
#     [path to pattern file, REGEX_DICT key, label written next to a hit]
# Entries have to be mutable lists because main() overwrites the label in
# place. Uncomment a line to enable that category.
global_arr = [
    # ['regex/fengting1.txt', 'fengting1', u'封停'],
    # ['regex/fengting2.txt', 'fengting2', u'封停'],
    # ['regex/geli.txt', 'geli', u'隔离'],
    # ['regex/clear_jinyan.txt', 'jinyan', u'禁言'],
    ['regex/clear_warning.txt', 'warning', u'警告'],
    # ['regex/nickname.txt', 'nick', u'昵称'],
    # ['regex/exception.txt', 'exc', u'例外'],
]

# NOTE(review): not referenced by any code visible in this file; presumably
# the file name of a pickled regex cache -- confirm before removing.
REG_DUMPS = 'regex.dumps'
def load():
    """Compile every pattern file listed in ``global_arr`` into ``REGEX_DICT``.

    Each file is read line by line. Every non-blank line is stripped, decoded
    from UTF-8 when ``FORCE_UNICODE`` is set, and compiled with ``re2`` (when
    ``FLAG_RE2`` is set) or the stdlib ``re``. The resulting
    ``(compiled_regex, pattern_text)`` pair is appended to
    ``REGEX_DICT[key]``.

    Prints the module in use and, at the end, how many compiled patterns are
    ``re2.Pattern`` instances versus anything else.

    Raises:
        AssertionError: if a category is still empty after reading its file.
        IOError: if a pattern file cannot be opened (propagated from open()).
    """
    engine = re2 if FLAG_RE2 else re
    print("Module = %s" % (engine))
    count_re = 0
    count_re2 = 0
    for fname, key, _ in global_arr:
        patterns = REGEX_DICT[key]
        # Context manager so the pattern file is closed deterministically.
        with open(fname) as rule_file:
            for line in rule_file:
                line = line.strip()
                if not line:
                    # A blank line would compile to the empty pattern, which
                    # matches every message and flags the whole log.
                    continue
                if FORCE_UNICODE:
                    line = line.decode('utf-8')
                regex = engine.compile(line)
                patterns.append((regex, line))
                if isinstance(regex, re2.Pattern):
                    count_re2 += 1
                else:
                    count_re += 1
        assert patterns, "no patterns loaded for %r from %s" % (key, fname)
    print("count re = %d, re2 = %d" % (count_re, count_re2))
# Reported by main() on stderr; nothing visible in this file ever updates it.
total_time = 0


def match(nick, msg):
    """Return the first loaded pattern that matches *msg*.

    Categories are tried in ``global_arr`` order and, within a category, in
    the order the patterns were appended to ``REGEX_DICT``.

    ``nick`` is accepted but not used: the nickname-specific matching branch
    is disabled, so only the message text is searched.

    Returns:
        ``(True, desc, pattern_text)`` for the first hit, otherwise
        ``(False, '', '')``.
    """
    # Lazy generator: stops searching as soon as the first hit is produced.
    hits = (
        (True, desc, pat)
        for _fname, key, desc in global_arr
        for regex, pat in REGEX_DICT[key]
        if regex.search(msg)
    )
    return next(hits, (False, '', ''))
# Removed from the message text before matching: "#c" followed by six hex
# digits, "#" followed by one of RGBKYWbunr#, or "#" followed by one to three
# digits.
# NOTE(review): presumably in-game colour / style / emoticon codes -- confirm
# against the log format.
REGEX_FILTER = re.compile(r'#c[0-9a-fA-F]{6}|#[RGBKYWbunr#]|#\d{1,3}')

# Directory that export_fname() writes its result files into.
OUTPUT_DIR = 'output'
try:
    os.makedirs(OUTPUT_DIR)
except OSError:
    # An already existing directory is fine. Anything else (permissions, a
    # regular file in the way, ...) would otherwise only resurface later as a
    # confusing failure when the result file is opened.
    if not os.path.isdir(OUTPUT_DIR):
        raise
def export_fname(fname):
    """Scan one log file and write every matching line to a result file.

    The result file is ``OUTPUT_DIR/plain-<server>-<basename>.result.txt``,
    where ``<server>`` is the name of the directory containing *fname* (or
    ``'NA'`` when there is none). Each input line is split on whitespace into
    at most eight fields; field 5 is passed on as the nick and the last field
    as the message. The message is cleaned with ``REGEX_FILTER`` and handed
    to ``match()``. For every hit one tab-separated record
    ``line, description, pattern`` is written.

    Lines with fewer than six fields are skipped. Prints the number of
    matching lines when done.
    """
    servername = os.path.basename(os.path.dirname(fname)) or 'NA'
    basename = os.path.basename(fname)
    oname = os.path.join(
        OUTPUT_DIR, 'plain-%s-%s.result.txt' % (servername, basename))
    match_count = 0
    # Nested `with` blocks close both files deterministically.
    with open(oname, 'w') as wf:
        with open(fname) as log_file:
            for line in log_file:
                line = line.strip()
                if FORCE_UNICODE:
                    line = line.decode('utf-8')
                arr = line.split(None, 7)
                if len(arr) < 6:
                    # Blank or truncated line: there is no nick field. Skip
                    # it instead of raising IndexError and aborting the file.
                    continue
                nick = arr[5]
                msg = REGEX_FILTER.sub('', arr[-1])
                flag, desc, pat = match(nick, msg)
                if not flag:
                    continue
                us = '\t'.join([line, desc, pat])
                if FORCE_UNICODE:
                    # Text was decoded on the way in; encode on the way out.
                    us = us.encode('utf-8')
                wf.write(us)
                wf.write('\n')
                match_count += 1
    print("match count = %s" % (match_count))
def run_re2_test():
    """Run main() on 'test-utf-8.log' with an extra argument.

    The extra argument brings ``sys.argv`` to three entries, which is what
    main() checks to switch over to re2.
    """
    sys.argv = sys.argv[:1] + ['test-utf-8.log', '1']
    main()
def run_re_test():
    """Run main() on 'test-utf-8.log' with no extra argument.

    With only two entries in ``sys.argv`` main() does not switch to re2.
    """
    sys.argv = sys.argv[:1] + ['test-utf-8.log']
    main()
def main():
    """Command-line entry point: load the patterns, then scan a file or tree.

    ``sys.argv[1]`` is either a single log file or a directory. For a
    directory, every ``<dir>/*/*.log`` file is processed in a process pool.
    Passing any second argument switches to re2 and turns off unicode
    decoding; in that mode the labels in ``global_arr`` are encoded to UTF-8
    so they can be joined with the undecoded log lines.

    Prints a usage line and exits with status 0 when no argument is given.
    Timing information is written to stderr.
    """
    global FLAG_RE2, FORCE_UNICODE

    if len(sys.argv) <= 1:
        print('Usage: %s <file>' % (sys.argv[0]))
        sys.exit(0)
    if len(sys.argv) >= 3:
        print('using re2!!!')
        FLAG_RE2 = True
        FORCE_UNICODE = False

    if not FORCE_UNICODE:
        for entry in global_arr:
            label = entry[2]
            # Encode only once: main() can run repeatedly in one process, and
            # re-encoding a non-ASCII byte string fails under Python 2.
            if not isinstance(label, bytes):
                entry[2] = label.encode('utf-8')

    t0 = time.time()
    load()
    sys.stderr.write('load complete. time = %s\n' % (time.time() - t0))

    t0 = time.time()
    if os.path.isdir(sys.argv[1]):
        import glob
        import multiprocessing
        fnames = glob.glob("%s/*/*.log" % sys.argv[1])
        # Leave one core free, but never ask for zero workers: Pool(0)
        # raises ValueError on a single-core host.
        workers = max(1, multiprocessing.cpu_count() - 1)
        # NOTE(review): workers presumably inherit the loaded REGEX_DICT via
        # fork -- confirm before running on a spawn-based platform.
        pool = multiprocessing.Pool(workers)
        result = pool.map_async(export_fname, fnames)
        pool.close()
        pool.join()
        # Re-raise the first worker exception instead of dropping it.
        result.get()
    else:
        export_fname(sys.argv[1])
    sys.stderr.write('process time = %s\n' % (time.time() - t0))
    # total_time is a module global that nothing visible here updates.
    sys.stderr.write('total process time = %s\n' % (total_time))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment