This script parse the data of Posts.xml, base of questions and answer of StackOverflow system. In this website has all base publics: Data Stack Exchange
Rodrigo Reis - @digoreis
This script parse the data of Posts.xml, base of questions and answer of StackOverflow system. In this website has all base publics: Data Stack Exchange
Rodrigo Reis - @digoreis
import xml.etree.ElementTree as ET | |
import html | |
import numpy as np | |
import pandas as pd | |
import re | |
import sys | |
namePostFile = sys.argv[1] | |
nameExportQuestions = sys.argv[2] | |
nameExportAnswer = sys.argv[3] | |
if namePostFile == "" or nameExportQuestions == "" or nameExportAnswer == "" : | |
print('stkparser.py <Post.xml> <QuestionsOutput> <AnswerOutput>') | |
sys.exit(2) | |
def cleanfile(fileName): | |
f = open(fileName, 'w') | |
f.close() | |
def write(question, answer): | |
with open(nameExportQuestions, 'a') as q: | |
q.write(question) | |
q.write("\n") | |
with open(nameExportAnswer, 'a') as q: | |
q.write(answer) | |
q.write("\n") | |
def cleanhtml(raw_html): | |
cleanr = re.compile('<.*?>') | |
cleantext = re.sub(cleanr, '', raw_html) | |
return cleantext | |
def xml2df(xml_data): | |
root = ET.XML(xml_data) # element tree | |
all_records = [] | |
all_records_complete = [] | |
answers = {} | |
size = len(root) | |
for i, child in enumerate(root): | |
record = {} | |
print("\rLines of Post.xml :: [{0}/{1}] => {2}%".format(i, size,np.around((i * 100) / size) ), end='') | |
if child.attrib["PostTypeId"] == "1" and "AcceptedAnswerId" in child.attrib.keys() : | |
record["ID"] = child.attrib["Id"] | |
record["QUESTION"] = cleanhtml(html.unescape(child.attrib["Body"])).replace('\n', ' ') | |
all_records.append(record) | |
if child.attrib["PostTypeId"] == "2" : | |
answers[child.attrib["ParentId"]] = cleanhtml(html.unescape(child.attrib["Body"])).replace('\n', ' ') | |
size = len(all_records) | |
print("") | |
for i, item in enumerate(all_records): | |
print("\rProcessing questions :: [{0}/{1}] => {2}%".format(i, size,np.around((i * 100) / size) ), end='') | |
if item["ID"] in answers.keys(): | |
record = {} | |
record["QUESTION"] = item["QUESTION"] | |
record["ANSWER"] = answers[item["ID"]] | |
all_records_complete.append(record) | |
print("") | |
return pd.DataFrame(all_records_complete) | |
print("Cleanning export files") | |
cleanfile(nameExportQuestions) | |
cleanfile(nameExportAnswer) | |
print("Loading Post.xml file") | |
xml_data = open(namePostFile).read() | |
data = xml2df(xml_data) | |
size = len(data) | |
for item in data.itertuples(): | |
print("\rLines writes in files :: [{0}/{1}] => {2}%".format(item[0], size,np.around((item[0] * 100) / size) ), end='') | |
write(item[1],item[2]) | |
print("\nFinish script - Final number of Question/Answer is {0}".format(size)) |