Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Python script to extract the tags that co-occur with a given tag on question-type posts in the Posts.xml file from the Stack Overflow data dump.
#
# Copyright (c) 2017, Venkatesh-Prasad Ranganath
#
# BSD 3-clause License
#
# Author: Venkatesh-Prasad Ranganath
#
import argparse
import datetime
import itertools
import queue
import re
import threading
from concurrent import futures
# Number of worker processes handed to the ProcessPoolExecutor.
NUM_PROCS = 7
# Line interval at which the reader hands a batch of lines to a worker.
CHUNK_SIZE = 5000
# Capacity of the bounded queue of pending job futures (two per worker).
JOB_QUEUE_SIZE = NUM_PROCS * 2
def process_lines(lines, anchor_tag):
    """Collect the tags that appear alongside *anchor_tag* in question rows.

    A line is considered only if it carries ``PostTypeId="1"`` followed by a
    ``Tags="..."`` attribute, and only if that attribute contains
    ``<anchor_tag>``.  Every other tag of such a line is added to the result.

    NOTE(review): the patterns look for literal ``<tag>`` markers inside the
    Tags attribute; if the input stores them XML-escaped (``&lt;tag&gt;``)
    nothing will match -- confirm against the actual dump.

    Args:
        lines: Iterable of text lines, one XML row per line.
        anchor_tag: Tag name (without angle brackets) to look for.  It is
            matched literally, so names such as ``c++`` or ``.net`` are safe.

    Returns:
        A flat list of co-occurring tag names; duplicates are kept and the
        anchor tag itself is not included.
    """
    question_row = re.compile(r'.* PostTypeId="1" .* Tags="([^"]*)" .*')
    # re.escape keeps regex metacharacters in the tag name ('+', '.', ...)
    # from being interpreted as part of the pattern.
    anchor = re.compile(r'(.*)<{0}>(.*)'.format(re.escape(anchor_tag)))

    found = []
    for line in lines:
        row = question_row.match(line)
        if not row:
            continue
        around = anchor.match(row.group(1))
        if not around:
            continue
        # Whatever surrounds the anchor inside the Tags value, e.g. "<a><b>".
        others = around.group(1) + around.group(2)
        if not others:
            # The anchor was the post's only tag.
            continue
        # "<a><b>" -> "<a<b" -> ",a,b" -> "a,b" -> ["a", "b"]
        found.extend(others.replace(">", "").replace("<", ",")[1:].split(','))
    return found
def create_jobs(posts_file_name, anchor_tag, queue):
    """Read the posts file in chunks and enqueue one worker job per chunk.

    Each chunk of CHUNK_SIZE lines is submitted to a process pool running
    ``process_lines`` and the resulting future is put on *queue*.  When the
    producer is finished -- normally or because of an error -- a single
    ``False`` sentinel is put on the queue so the consumer stops waiting.

    Args:
        posts_file_name: Path of the posts file to read.
        anchor_tag: Tag passed through to ``process_lines``.
        queue: Queue-like object receiving the futures and the final
            ``False`` sentinel.  (The parameter name shadows the ``queue``
            module inside this function; the module is not needed here.)
    """
    try:
        with open(posts_file_name, "rt") as posts_file:
            with futures.ProcessPoolExecutor(max_workers=NUM_PROCS) as exe:
                lines = []
                for line in posts_file:
                    lines.append(line)
                    # Submit once a full chunk has accumulated, so every job
                    # (except possibly the last) holds CHUNK_SIZE lines.
                    if len(lines) >= CHUNK_SIZE:
                        queue.put(exe.submit(process_lines, lines, anchor_tag),
                                  True)
                        lines = []
                # Flush the trailing partial chunk, if there is one.
                if lines:
                    queue.put(exe.submit(process_lines, lines, anchor_tag),
                              True)
        print("Done creating jobs")
    finally:
        # Always release the consumer; without the sentinel it would block
        # forever on queue.get() if reading or submitting failed.  Any error
        # still propagates out of this function after the sentinel is sent.
        queue.put(False)
def main():
    """Command-line entry point.

    Parses ``--posts_file`` and ``--anchor_tag``, starts a producer thread
    running ``create_jobs``, gathers the tags returned by every job into a
    set, and writes them (plus the anchor tag itself) sorted, one per line,
    to ``tags-occurring-with-<anchor_tag>-tag.txt``.
    """
    cli_parser = argparse.ArgumentParser(
        description='Extract tags that occur with given tag in Stack '
                    'Overflow posts')
    cli_parser.add_argument('-p', '--posts_file', type=str, required=True,
                            help='Stack Overflow Posts XML file')
    cli_parser.add_argument('-a', '--anchor_tag', type=str, required=True,
                            help='Tag with which other tags should co-occur')
    options = cli_parser.parse_args()

    # Deliberately not named `queue`, which would rebind the imported module.
    job_queue = queue.Queue(JOB_QUEUE_SIZE)
    job_creator = threading.Thread(target=create_jobs,
                                   args=(options.posts_file,
                                         options.anchor_tag, job_queue))
    job_creator.start()

    cooccurring_tags = set()
    i = 0
    while True:
        future = job_queue.get()
        # The producer signals completion with the literal False sentinel.
        if future is False:
            job_queue.task_done()
            break
        cooccurring_tags.update(future.result())
        job_queue.task_done()
        # Progress report every 100 jobs: approximate number of lines
        # processed, distinct tags so far, and a timestamp.
        if i % 100 == 0:
            print(i * CHUNK_SIZE, len(cooccurring_tags),
                  datetime.datetime.now())
        i += 1
    job_creator.join()

    cooccurring_tags.add(options.anchor_tag.strip())
    tmp1 = "tags-occurring-with-{0}-tag.txt".format(options.anchor_tag)
    with open(tmp1, "wt") as tags_file:
        tags_file.writelines("{0}\n".format(tag)
                             for tag in sorted(cooccurring_tags))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment