Skip to content

Instantly share code, notes, and snippets.

@myl7
Last active May 1, 2021 16:59
Show Gist options
  • Save myl7/6118aae43eeccddd47dcc98fc60115ae to your computer and use it in GitHub Desktop.
Save myl7/6118aae43eeccddd47dcc98fc60115ae to your computer and use it in GitHub Desktop.
Analyze rsyncd logs with multiprocessing, tolerating UTF-8 decoding errors in the log files. Based on https://gist.github.com/taoky/91c12185c2cd38f264fe2863a6b13c27.
import sys
from collections import defaultdict
from multiprocessing import Pool
import json
import glob
# Serialiser used for the final report: PyYAML's ``dump`` when the package is
# importable, otherwise the standard library's ``json.dumps``.
dumper = json.dumps
try:
    import yaml
except ImportError:
    pass
else:
    dumper = yaml.dump

# Number of worker processes in the analysis pool.
JOBS = 14
# Glob pattern (relative to the current directory) selecting the log files.
LOG_GLOB = 'rsyncd.log-*'
def analyze(line):
    """Return the repository a log line's "send" transfer belongs to.

    The parser relies only on bracket/parenthesis positions.  Lines are
    assumed to follow rsyncd's transfer-log layout, roughly
    ``... [PID] send HOST [ADDR] MODULE (USER) PATH ...`` (field names are
    inferred from rsync's default log format; confirm against the daemon's
    ``log format`` setting).

    * The text after the first ``]`` must start with ``send``; any other line
      is ignored.
    * The module name is the text between the second ``]`` and the following
      ``(``.
    * The aggregate module ``repo`` is resolved one level further: the
      repository is the first component of the path after ``(USER)``.

    Args:
        line: One line of log text.

    Returns:
        The repository name, or ``None`` when the line is not a "send" record
        or lacks the expected delimiters.

    Raises:
        ValueError: If a ``repo`` record's path contains no ``/``.
    """
    # The first ']' closes the leading bracketed field; the operation name
    # starts two characters later (after "] ").
    pid_end = line.find(']')
    if pid_end < 0 or line[pid_end + 2:pid_end + 6] != 'send':
        return None

    # The second ']' closes the bracketed field that precedes the module.
    addr_end = line.find(']', pid_end + 1)
    if addr_end < 0:
        return None

    # The module sits between "] " and " (".  A record without '(' is
    # malformed and is skipped rather than counted under an empty name.
    user_open = line.find('(', addr_end)
    if user_open < 0:
        return None
    module = line[addr_end + 2:user_open - 1]

    # Compare the whole module name: a prefix test would misclassify modules
    # such as "repository" as the aggregate "repo" module.
    if module != 'repo':
        return module

    user_close = line.find(')', user_open)
    if user_close < 0:
        return None
    slash = line.find('/', user_close)
    if slash < 0:
        raise ValueError(f'no "/" in path of repo transfer: {line!r}')
    # Skip ") " and keep everything up to the first path separator.
    return line[user_close + 2:slash]
def safe_iter(f):
    """Yield every item of *f*, stepping over text that fails to decode.

    When iterating *f* raises ``UnicodeDecodeError`` the error is reported on
    standard error and iteration resumes on the same object, so one badly
    encoded stretch of a log does not abort the whole run.  How much input is
    lost around the bad bytes depends on *f* itself.

    Only decoding errors are retried: any other exception propagates, because
    retrying a persistent failure (closed file, I/O error) would never end.

    Args:
        f: An iterable that keeps its position between iterations, such as a
            text-mode file object.

    Yields:
        The items produced by *f*.
    """
    while True:
        try:
            yield from f
        except UnicodeDecodeError as e:
            # Report on stderr: stdout may carry the final report.
            print(e, file=sys.stderr)
        else:
            # *f* is exhausted.
            return
def main():
    """Count rsyncd "send" transfers per repository and emit a report.

    Every file matching ``LOG_GLOB`` is processed in sorted name order.  Its
    lines are classified by ``analyze`` in a pool of ``JOBS`` worker
    processes, and lines for which ``analyze`` returns ``None`` are ignored.
    The totals are serialised with ``dumper`` and written to the path given
    as the first command-line argument, or printed to standard output when
    no argument is given.  Progress messages go to standard error.
    """
    counters = defaultdict(int)
    with Pool(JOBS) as pool:
        for path in sorted(glob.glob(LOG_GLOB)):
            with open(path) as f:
                for res in pool.map(analyze, safe_iter(f)):
                    if res is not None:
                        counters[res] += 1
            # Progress goes to stderr so it never mixes into a report
            # printed on stdout.
            print(f'Analyze {path} OK', file=sys.stderr)

    # Serialise a plain dict: yaml.dump would otherwise emit a
    # python/object tag for the defaultdict instead of a simple mapping.
    report = dumper(dict(counters))
    if len(sys.argv) > 1:
        with open(sys.argv[1], 'w') as out:
            out.write(report)
    else:
        print(report)
# Run the analysis only when this file is executed as a script.  The guard
# also keeps multiprocessing workers that re-import this module (spawn start
# method) from launching the analysis themselves.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment