Skip to content

Instantly share code, notes, and snippets.

@linzeqipku
Last active February 16, 2018 04:56
Show Gist options
  • Save linzeqipku/87089f707dadaf23a33fdb91877bcff2 to your computer and use it in GitHub Desktop.
Save linzeqipku/87089f707dadaf23a33fdb91877bcff2 to your computer and use it in GitHub Desktop.
Extract questions and answers from StackOverflow dump files by tags

Step 1: set srcPath, dstPath and tagsPattern in so-splitter-Posts.py, then run it.

Step 2: run so-splitter-PostLinks.py (config: srcPath, dstPath)

import xml.parsers.expat
import re,json
# --- Configuration ---------------------------------------------------------
# Directory that contains the dump; Posts.xml is read from here.
srcPath = 'E:/tmp/so'
# Directory that receives the extracted output files.
dstPath = 'E:/tmp/so-extracted'
# Regular expression searched for (re.search) in each question's Tags value.
tagsPattern = '<lucene>'

# Binary handle on the posts dump; it is later passed to the expat parser.
post_in_file = open('/'.join((srcPath, 'Posts.xml')), 'rb')

# --- Accumulators filled by the parser callback ----------------------------
# Ids of the selected questions / answers, in the order they are encountered.
question_ids, answer_ids = [], []
# Attribute dicts of the selected questions / answers.
questions, answers = [], []
def post_element_handler(name, attrs):
    """Record matching questions and their answers from one XML element.

    Intended to be installed as the expat ``StartElementHandler``: it is
    called once per opening tag with the tag name and its attributes.

    * ``PostTypeId == '1'`` and ``Tags`` matches ``tagsPattern``: the element
      is appended to ``questions`` and its ``Id`` to ``question_ids``.
    * ``PostTypeId == '2'`` and ``ParentId`` is already in ``question_ids``:
      the element is appended to ``answers`` and its ``Id`` to
      ``answer_ids``.

    Elements without a ``PostTypeId`` attribute, or with any other value,
    are ignored. Rows that lack ``Tags`` / ``ParentId`` are skipped rather
    than aborting the whole parse with a ``KeyError``.

    Args:
        name: Tag name of the element (unused).
        attrs: Mapping of attribute names to their string values.
    """
    post_type = attrs.get('PostTypeId')

    if post_type == '1':
        # A question without a Tags attribute cannot match; treat it as
        # having no tags instead of raising.
        if re.search(tagsPattern, attrs.get('Tags', '')) is not None:
            question_ids.append(attrs['Id'])
            questions.append(attrs)
            print('Question '+attrs['Id']+'('+str(len(questions))+')')
    elif post_type == '2':
        # Only answers whose question was selected *earlier* in the stream
        # are kept, because the lookup runs against the ids collected so far.
        # NOTE(review): question_ids is a list, so this membership test is a
        # linear scan per answer; a set would make it O(1) if the
        # module-level container can be changed.
        if attrs.get('ParentId') in question_ids:
            answer_ids.append(attrs['Id'])
            answers.append(attrs)
# Stream the dump through expat; every opening tag is routed to
# post_element_handler, which fills questions / answers.
post_parser = xml.parsers.expat.ParserCreate()
post_parser.StartElementHandler = post_element_handler

# The context manager closes the input handle even if parsing raises.
with post_in_file:
    post_parser.ParseFile(post_in_file)

# The collected attribute dicts are serialised as JSON (despite the .xml
# file names). Both outputs are closed even if serialisation fails.
# NOTE(review): append mode ('a') means re-running the script adds a second
# JSON array to the same file -- confirm this is intended before changing it.
with open(dstPath + '/Questions.xml', 'a', encoding='utf-8') as question_out_file, \
        open(dstPath + '/Answers.xml', 'a', encoding='utf-8') as answer_out_file:
    question_out_file.write(json.dumps(questions))
    answer_out_file.write(json.dumps(answers))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment