#
# Copyright (c) 2017, Venkatesh-Prasad Ranganath
#
# BSD 3-clause License
#
# Author: Venkatesh-Prasad Ranganath
#

import argparse
import datetime
import itertools
import queue
import re
import threading
from concurrent import futures


# Number of worker processes in the pool that scans chunks of the posts file.
NUM_PROCS = 7
# Number of input lines bundled into a single job for a worker process.
CHUNK_SIZE = 5000
# Capacity of the bounded job queue; the producer blocks once this many
# submitted jobs are waiting to be collected.
JOB_QUEUE_SIZE = NUM_PROCS * 2


def process_lines(lines, anchor_tag):
    """Collect the tags that appear together with *anchor_tag*.

    Only lines that carry a ``PostTypeId="1"`` attribute followed later by a
    ``Tags="..."`` attribute are considered (presumably question rows of the
    Stack Overflow posts dump -- confirm against the data). The tags value is
    expected in ``<tag1><tag2>...`` form.

    Args:
        lines: Iterable of text lines to scan.
        anchor_tag: Tag name, without angle brackets, that must be present in
            a line's tag list for its other tags to be collected. It is
            matched literally.

    Returns:
        A list with the tags found next to the anchor tag, in input order.
        The anchor tag itself is not included, the same tag may appear more
        than once, and lines tagged with only the anchor tag add nothing.
    """
    tags_attr_re = re.compile(r'.* PostTypeId="1" .* Tags="([^"]*)" .*')
    # Escape the tag so names such as "c++" or ".net" are matched literally
    # instead of being interpreted as regex syntax.
    anchor_re = re.compile(r'(.*)<{0}>(.*)'.format(re.escape(anchor_tag)))

    co_occurring = []
    for line in lines:
        row = tags_attr_re.match(line)
        if not row:
            continue
        hit = anchor_re.match(row.group(1))
        if not hit:
            continue
        # Whatever surrounds the anchor tag, e.g. "<a><b>"; empty when the
        # anchor tag is the only tag on the post.
        others = hit.group(1) + hit.group(2)
        if not others:
            continue
        # "<a><b>" -> "<a<b" -> ",a,b" -> "a,b" -> ["a", "b"]
        co_occurring.extend(
            others.replace(">", "").replace("<", ",")[1:].split(','))
    return co_occurring


def create_jobs(posts_file_name, anchor_tag, queue):
    """Split the posts file into chunks and submit each chunk as a job.

    Every job is a ``process_lines`` call running in a process pool. The
    resulting futures are put on *queue* in submission order, followed by a
    single ``False`` sentinel that tells the consumer no more jobs will come.

    Args:
        posts_file_name: Path of the text file to read line by line.
        anchor_tag: Tag forwarded unchanged to every ``process_lines`` job.
        queue: Queue-like object (it shadows the ``queue`` module inside this
            function) that receives the futures and the final sentinel.
            ``put`` is called in blocking mode, so a bounded queue throttles
            how far job creation runs ahead of the consumer.

    Raises:
        OSError: If the posts file cannot be opened or read. The sentinel is
            still enqueued so a waiting consumer does not block forever.
    """
    try:
        with open(posts_file_name, "rt") as posts_file:
            with futures.ProcessPoolExecutor(max_workers=NUM_PROCS) as exe:
                lines = []
                for line in posts_file:
                    lines.append(line)
                    # Hand over a job once a full chunk has been gathered.
                    if len(lines) >= CHUNK_SIZE:
                        queue.put(
                            exe.submit(process_lines, lines, anchor_tag),
                            True)
                        lines = []
                # Submit the left-over lines that did not fill a whole chunk.
                if lines:
                    queue.put(exe.submit(process_lines, lines, anchor_tag),
                              True)
                print("Done creating jobs")
    finally:
        # Always signal end-of-jobs, even on failure, so the consumer's
        # blocking get() is guaranteed to return.
        queue.put(False)


def main():
    """Command-line entry point.

    Parses ``--posts_file`` and ``--anchor_tag``, starts a thread that runs
    ``create_jobs``, merges the result of every job into one set of tags, and
    writes that set (plus the anchor tag itself), sorted and one tag per
    line, to ``tags-occurring-with-<anchor_tag>-tag.txt`` in the current
    directory.
    """
    # Adjacent literals keep source indentation out of the description text.
    cli_parser = argparse.ArgumentParser(
        description='Extract tags that occur with given tag in Stack '
                    'Overflow posts')
    cli_parser.add_argument('-p', '--posts_file', type=str, required=True,
                            help='Stack Overflow Posts XML file')
    cli_parser.add_argument('-a', '--anchor_tag', type=str, required=True,
                            help='Tag with which other tags should co-occur')
    options = cli_parser.parse_args()

    # Named job_queue so the imported ``queue`` module is not rebound.
    job_queue = queue.Queue(JOB_QUEUE_SIZE)
    job_creator = threading.Thread(target=create_jobs,
                                   args=(options.posts_file,
                                         options.anchor_tag, job_queue))
    job_creator.start()

    cooccurring_tags = set()
    i = 0
    while True:
        future = job_queue.get()
        # A falsy item is the end-of-jobs sentinel put by create_jobs.
        if not future:
            job_queue.task_done()
            break
        cooccurring_tags.update(future.result())
        job_queue.task_done()
        # Progress report every 100 jobs: approximate lines processed,
        # distinct tags collected so far, and the current time.
        if i % 100 == 0:
            print(i * CHUNK_SIZE, len(cooccurring_tags),
                  datetime.datetime.now())
        i += 1
    job_creator.join()

    cooccurring_tags.add(options.anchor_tag.strip())
    tags_file_name = "tags-occurring-with-{0}-tag.txt".format(
        options.anchor_tag)
    with open(tags_file_name, "wt") as tags_file:
        tags_file.writelines("{0}\n".format(tag)
                             for tag in sorted(cooccurring_tags))


if __name__ == "__main__":
    main()