Skip to content

Instantly share code, notes, and snippets.

@salexkidd
Created November 19, 2015 04:01
Show Gist options
  • Save salexkidd/114e62e4e05becc0a33f to your computer and use it in GitHub Desktop.
html file parsing
import pathlib
import multiprocessing
from more_itertools import chunked
from bs4 import BeautifulSoup
HTML_DIR = "./html"
PROCESS_NUM = 8
def parser(file_list, process_num):
    """Parse every HTML file in *file_list* with BeautifulSoup.

    Args:
        file_list: iterable of pathlib.Path objects pointing at .html files.
        process_num: integer id of the worker process (used for logging only).
    """
    # Bug fix: the original format string had one "{}" but two arguments,
    # so the list count was silently dropped from the log line.
    print("PID: {} list_count: {}".format(process_num, len(file_list)))
    for html_file in file_list:
        with html_file.open(mode="r") as fh:
            html_str = fh.read()
        # Parsing happens after the file handle is closed; only the parsed
        # object's type is reported (the gist demonstrates parsing, not use).
        bs_obj = BeautifulSoup(html_str, "lxml")
        print(type(bs_obj))
def chunked_file_list(path, chunked_num):
    """Split the *.html files under *path* into at most *chunked_num* chunks.

    Args:
        path: directory to scan (non-recursively) for .html files.
        chunked_num: desired maximum number of chunks (one per worker).

    Yields:
        Lists of pathlib.Path objects.  Yields nothing for an empty directory.
    """
    file_list = list(pathlib.Path(path).glob("*.html"))
    # Bug fix: the original floor division produced a chunk size of 0
    # (ValueError in more_itertools.chunked) when there were fewer files
    # than workers, and MORE than chunked_num chunks (hence more processes
    # than PROCESS_NUM) when the counts did not divide evenly.  Ceiling
    # division guarantees at most chunked_num chunks.
    split_count = -(-len(file_list) // chunked_num)
    for start in range(0, len(file_list), max(split_count, 1)):
        yield file_list[start:start + split_count]
def main():
    """Fan the HTML files out across PROCESS_NUM worker processes."""
    multi = True  # flip to False to run every chunk in-process (debugging)
    workers = []
    for index, files in enumerate(chunked_file_list(HTML_DIR, PROCESS_NUM)):
        if not multi:
            parser(files, index)
            continue
        workers.append(
            multiprocessing.Process(target=parser, args=(files, index))
        )
    if multi:
        # Launch every worker first, then wait for all of them to finish.
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    print("That's all!")
# Entry-point guard: run main() only when executed as a script, not on
# import — this also keeps multiprocessing child processes from
# re-spawning workers when they import this module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment