Using multiprocessing to process large log files
# see https://www.blopig.com/blog/2016/08/processing-large-files-using-python/
import multiprocessing as mp
import os
import re
from datetime import datetime, timedelta

RESULT_FILENAME = 'result.csv'
FILENAME_TEMPLATE = 'log-{date}'
CHUNK_SIZE = 1024 * 1024  # target chunk size in bytes (1 MiB)

# Placeholder pattern: capture three fields from lines that start with SEVERITY.
log_pattern = re.compile(r'^SEVERITY.*(?<=first param in quotes\s")(.*)(?=").*(?<=second param\s)(\w+).*?(third param)?$')

# A manager-backed list lets all pool workers append matches to shared state.
# Note: this relies on the workers inheriting the proxy, i.e. the 'fork'
# start method (the default on Linux).
manager = mp.Manager()
result = manager.list()

def write_result_csv(result):
    # Wrapping each cell as ="..." forces spreadsheet apps to treat it as text.
    with open(RESULT_FILENAME, 'w') as work_f:
        work_f.write('="first param";="second param";="third param"\n')
        for row in result:
            cells = map(lambda x: f'="{x}"', row)
            work_f.write(';'.join(cells) + '\n')

def get_logfile_name():
    # Process yesterday's rotated log file, e.g. 'log-20200804'.
    yesterday_str = datetime.strftime(datetime.now() - timedelta(1), '%Y%m%d')
    return FILENAME_TEMPLATE.format(date=yesterday_str)

def process(line):
    # The gist left this as a stub; the evident intent is to collect the
    # captured groups of matching lines in the shared result list.
    match = log_pattern.match(line)
    if match:
        result.append(tuple(group or '' for group in match.groups()))

def worker(filename, chunk_start, chunk_size):
    # Each worker re-opens the file and reads only its own byte range.
    # Binary mode is used because the offsets are byte offsets; text-mode
    # seek()/read() count characters and can misalign on multi-byte input.
    with open(filename, 'rb') as f:
        f.seek(chunk_start)
        for line in f.read(chunk_size).splitlines():
            process(line.decode('utf-8', errors='replace'))

def chunkify(fname, size=CHUNK_SIZE):
    # Yield (start, length) pairs of roughly size-byte chunks, each extended
    # to the next newline so that no line straddles two workers.
    file_end = os.path.getsize(fname)
    with open(fname, 'rb') as f:
        chunk_end = f.tell()
        while True:
            chunk_start = chunk_end
            f.seek(size, 1)   # jump ~size bytes ahead...
            f.readline()      # ...then advance to the end of the current line
            chunk_end = f.tell()
            yield chunk_start, chunk_end - chunk_start
            if chunk_end >= file_end:  # '>=' avoids yielding a final empty chunk
                break

if __name__ == '__main__':  # guard so pool workers don't re-run this block
    pool = mp.Pool(mp.cpu_count())
    jobs = []
    filename = get_logfile_name()
    for chunk_start, chunk_size in chunkify(filename):
        jobs.append(pool.apply_async(worker, (filename, chunk_start, chunk_size)))
    for job in jobs:
        job.get()  # block until done; re-raises any worker exception
    pool.close()
    pool.join()
    write_result_csv(result)
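
For reference, here is a hypothetical log line that the placeholder pattern above would match, and the CSV row it would produce. The line itself is made up; a real deployment would adapt the pattern to the actual log format.

line = 'SEVERITY: msg first param in quotes "alpha" then second param beta third param'
log_pattern.match(line).groups()  # -> ('alpha', 'beta', 'third param')
# resulting row in result.csv:
# ="alpha";="beta";="third param"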
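
As a quick sanity check of the chunking logic (a self-contained sketch; the file contents and sizes are made up), the yielded chunks should cover the file exactly and each should end on a line break:

import os, tempfile

# build a small throwaway log file (hypothetical data, just for the check)
with tempfile.NamedTemporaryFile('w', suffix='.log', delete=False) as tmp:
    tmp.write(''.join(f'line {i:04d}\n' for i in range(5000)))

pieces = []
with open(tmp.name, 'rb') as f:
    for start, length in chunkify(tmp.name, size=4096):
        f.seek(start)
        pieces.append(f.read(length))

assert all(p.endswith(b'\n') for p in pieces)           # chunks end on newlines
assert b''.join(pieces) == open(tmp.name, 'rb').read()  # no gaps, no overlaps
os.unlink(tmp.name)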