Skip to content

Instantly share code, notes, and snippets.

@digoreis
Last active March 9, 2022 19:03
Show Gist options
  • Save digoreis/65ece0aa3a77a479c00af2738aef0742 to your computer and use it in GitHub Desktop.
Save digoreis/65ece0aa3a77a479c00af2738aef0742 to your computer and use it in GitHub Desktop.
Sample parser for the Stack Overflow Posts.xml file

Parsing the Stack Overflow Posts.xml file

This script parses the data in Posts.xml, the database of questions and answers from the Stack Overflow system. All of the public databases are available on this website: Data Stack Exchange

Video

Video

Copyright

Rodrigo Reis - @digoreis

import xml.etree.ElementTree as ET
import html
import numpy as np
import pandas as pd
import re
import sys
# Command-line arguments: the input Posts.xml path followed by the two output
# file paths (questions, answers).
# Check the argument count BEFORE indexing sys.argv, so that a missing
# argument prints the usage line and exits with status 2 instead of raising
# IndexError. Empty-string arguments are rejected as well.
if len(sys.argv) < 4 or "" in sys.argv[1:4]:
    print('stkparser.py <Post.xml> <QuestionsOutput> <AnswerOutput>')
    sys.exit(2)
namePostFile, nameExportQuestions, nameExportAnswer = sys.argv[1:4]
def cleanfile(fileName):
    """Truncate the file at *fileName* to zero length, creating it if absent."""
    # Opening in 'w' mode is what empties (or creates) the file; nothing is
    # written before the handle is closed again.
    with open(fileName, 'w'):
        pass
def write(question, answer):
    """Append one question and its answer to the two export files.

    *question* is appended as a single line to the file named by the
    module-level ``nameExportQuestions``; *answer* is appended as a single
    line to the file named by ``nameExportAnswer``. Both arguments must be
    strings.
    """
    # Pin the encoding: post text routinely contains non-ASCII characters,
    # and the platform default encoding may be unable to represent them.
    with open(nameExportQuestions, 'a', encoding='utf-8') as questions_file:
        questions_file.write(question + "\n")
    with open(nameExportAnswer, 'a', encoding='utf-8') as answers_file:
        answers_file.write(answer + "\n")
def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` span removed.

    The match is non-greedy and does not cross newlines, so a tag split over
    several lines is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
def xml2df(xml_data):
    """Build a DataFrame of question / accepted-answer pairs from Posts.xml.

    Args:
        xml_data: The XML document, as accepted by ``ET.XML``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``QUESTION`` and ``ANSWER``
        (in that order), one row per question whose accepted answer is
        present in the document. Bodies are HTML-unescaped, stripped of tags
        via ``cleanhtml`` and flattened onto a single line.

    Progress is printed to stdout while the document is scanned and while
    the pairs are assembled.
    """
    def plain_text(body):
        # Decode HTML entities, drop the tags, keep each post on one line.
        return cleanhtml(html.unescape(body)).replace('\n', ' ')

    root = ET.XML(xml_data)  # element tree
    questions = []       # (accepted answer id, raw question body)
    answer_bodies = {}   # answer post Id -> raw answer body
    size = len(root)
    for i, child in enumerate(root):
        print("\rLines of Post.xml :: [{0}/{1}] => {2}%".format(i, size,np.around((i * 100) / size) ), end='')
        post_type = child.get("PostTypeId")
        if post_type == "1" and "AcceptedAnswerId" in child.attrib:
            questions.append((child.attrib["AcceptedAnswerId"], child.attrib["Body"]))
        elif post_type == "2":
            # Key answers by their own Id (not ParentId) so a question is
            # matched with its accepted answer rather than with whichever
            # answer to it happens to come last in the file.
            answer_bodies[child.attrib["Id"]] = child.attrib["Body"]
    size = len(questions)
    print("")
    pairs = []
    for i, (accepted_id, question_body) in enumerate(questions):
        print("\rProcessing questions :: [{0}/{1}] => {2}%".format(i, size,np.around((i * 100) / size) ), end='')
        answer_body = answer_bodies.get(accepted_id)
        if answer_body is None:
            # The accepted answer is not in this document; skip the question.
            continue
        # Clean only the bodies that actually end up in the result.
        pairs.append({
            "QUESTION": plain_text(question_body),
            "ANSWER": plain_text(answer_body),
        })
    print("")
    # Explicit columns keep the frame's shape stable even when no pair exists.
    return pd.DataFrame(pairs, columns=["QUESTION", "ANSWER"])
# Script body: empty both export files, parse the posts dump, then append one
# question line and one answer line per extracted pair.
print("Cleanning export files")
cleanfile(nameExportQuestions)
cleanfile(nameExportAnswer)
print("Loading Post.xml file")
# Read the dump as bytes inside a context manager: the handle is closed
# promptly, and the XML parser decodes the data using the encoding declared
# in the document instead of the platform default.
with open(namePostFile, 'rb') as post_file:
    xml_data = post_file.read()
data = xml2df(xml_data)
size = len(data)
for item in data.itertuples():
    # item[0] is the row index; item[1] and item[2] are the first two columns
    # of the frame returned by xml2df.
    print("\rLines writes in files :: [{0}/{1}] => {2}%".format(item[0], size,np.around((item[0] * 100) / size) ), end='')
    write(item[1],item[2])
print("\nFinish script - Final number of Question/Answer is {0}".format(size))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment