Step 1: run so-splitter-Posts.py (config: srcPath, dstPath, tagsPattern)
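Step 2: run so-splitter-PostLinks.py (config: srcPath, dstPath). It reads the Questions.xml and Answers.xml files that step 1 writes to dstPath, so step 1 must finish first.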
import json
import xml.parsers.expat

# Configuration: srcPath is the directory holding the original dump,
# dstPath the directory holding the extracted files.
srcPath = 'E:/tmp/so'
dstPath = 'E:/tmp/so-extracted'

# Ids of every extracted question and answer.  A set keeps the two membership
# tests performed for each PostLinks row O(1); with a list every test is a
# linear scan over all collected ids.
post_ids = set()

# Despite the .xml suffix, both files are parsed as a single JSON array of
# attribute dicts.  `with` closes each handle as soon as it has been read.
for extracted_name in ('Questions.xml', 'Answers.xml'):
    with open(dstPath + '/' + extracted_name, 'rb') as extracted_file:
        post_ids.update(post['Id'] for post in json.loads(extracted_file.read()))

# Left open on purpose: the parsing section further down consumes this handle
# and closes it.
link_in_file = open(srcPath + '/PostLinks.xml', 'rb')

# Attribute dicts of the PostLinks rows that get kept.
links = []
def link_element_handler(name, attrs):
    """Keep a PostLinks element when both posts it connects were extracted.

    Used as the expat start-element callback.  A kept element's attribute
    mapping is appended to the module-level ``links`` list and a progress
    line is printed.

    Args:
        name: Element name supplied by expat; not used.
        attrs: Attribute mapping of the element.
    """
    # .get() instead of indexing: an element without PostId or RelatedPostId
    # is skipped rather than raising KeyError in the middle of the parse.
    if attrs.get('PostId') in post_ids and attrs.get('RelatedPostId') in post_ids:
        links.append(attrs)
        print('Link ' + attrs['Id'] + '(' + str(len(links)) + ')')
# Stream PostLinks.xml through expat; link_element_handler fills `links`.
link_parser = xml.parsers.expat.ParserCreate()
link_parser.StartElementHandler = link_element_handler
try:
    link_parser.ParseFile(link_in_file)
finally:
    # Release the input handle even when expat raises on malformed input.
    link_in_file.close()

# Mode 'w' rather than 'a': the output must hold exactly one JSON array.
# Appending on a second run would leave two concatenated arrays, which is
# no longer valid JSON.
with open(dstPath + '/PostLinks.xml', 'w', encoding='utf-8') as link_out_file:
    link_out_file.write(json.dumps(links))
import json
import re
import xml.parsers.expat

# Configuration: srcPath is the directory holding the original dump,
# dstPath the directory the extracted files are written to.
srcPath = 'E:/tmp/so'
dstPath = 'E:/tmp/so-extracted'
# Regular expression searched for in each question's Tags attribute;
# questions whose tags match are kept.
tagsPattern = '<lucene>'

# Left open on purpose: the parsing section further down consumes this handle
# and closes it.
post_in_file = open(srcPath + '/Posts.xml', 'rb')
# Ids of the questions kept so far.  A set makes the parent lookup done for
# every answer O(1); with a list each lookup scans all kept question ids.
question_ids = set()
# Ids of the answers kept so far, in the order they were seen.
answer_ids = []
# Attribute dicts of the kept questions and answers.
questions = []
answers = []


def post_element_handler(name, attrs):
    """Keep matching questions and the answers that belong to them.

    Used as the expat start-element callback.  A question (PostTypeId '1')
    is kept when ``tagsPattern`` is found in its Tags attribute.  An answer
    (PostTypeId '2') is kept when its ParentId is the id of a question that
    has already been kept, so an answer that precedes its question in the
    stream is not collected.

    Args:
        name: Element name supplied by expat; not used.
        attrs: Attribute mapping of the element.
    """
    post_type = attrs.get('PostTypeId')
    if post_type == '1':
        # A question without a Tags attribute is treated as having no tags
        # instead of raising KeyError.
        if re.search(tagsPattern, attrs.get('Tags', '')) is not None:
            question_ids.add(attrs['Id'])
            questions.append(attrs)
            print('Question ' + attrs['Id'] + '(' + str(len(questions)) + ')')
    elif post_type == '2':
        # .get() so an answer lacking ParentId is skipped, not a KeyError.
        if attrs.get('ParentId') in question_ids:
            answer_ids.append(attrs['Id'])
            answers.append(attrs)
# Stream Posts.xml through expat; post_element_handler fills `questions`
# and `answers`.
post_parser = xml.parsers.expat.ParserCreate()
post_parser.StartElementHandler = post_element_handler
try:
    post_parser.ParseFile(post_in_file)
finally:
    # Release the input handle even when expat raises on malformed input.
    post_in_file.close()

# Mode 'w' rather than 'a': each output must hold exactly one JSON array.
# Appending on a second run would leave two concatenated arrays, and
# so-splitter-PostLinks.py loads these files with json.loads.
with open(dstPath + '/Questions.xml', 'w', encoding='utf-8') as question_out_file:
    question_out_file.write(json.dumps(questions))
with open(dstPath + '/Answers.xml', 'w', encoding='utf-8') as answer_out_file:
    answer_out_file.write(json.dumps(answers))