Created
May 19, 2022 17:39
-
-
Save LunarWatcher/9a06ce1fb21bc26eb6d32c20782386dc to your computer and use it in GitHub Desktop.
Converts the comment data dump from Stack Overflow to SQLite
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sqlite3 as sl3 | |
import re | |
con = sl3.connect("comments.db") | |
cur = con.cursor() | |
cur.execute("""CREATE TABLE IF NOT EXISTS Comments (UserID INTEGER, CommentID INTEGER PRIMARY KEY, Content TEXT, PostID INTEGER)""") | |
con.commit() | |
with open("Comments.xml") as f: | |
print("File loaded") | |
cnt = 0 | |
comments = 1 | |
while line := f.readline(): | |
if "<row" not in line: | |
continue | |
id = re.search("Id=\"(\\d+)\"", line) | |
if (id is None): | |
id = comments + 1 | |
comments += 1 | |
else: | |
id = id.group(1) | |
comments = id | |
postId = re.search("PostId=\"(\\d+)\"", line) | |
if (postId is not None): | |
postId = postId.group(1) | |
text = re.search("Text=\"(.*)\" CreationDate=", line) | |
text = text.group(1) | |
userId = re.search("UserId=\"(\\d+)\"", line) | |
if (userId is None): | |
userId = -1 | |
else: | |
userId = userId.group(1) | |
cur.execute("""INSERT OR REPLACE INTO Comments(UserID, CommentID, Content, PostID) VALUES (?, ?, ?, ?)""", (userId, id, text, postId)) | |
cnt += 1 | |
if (cnt % 10000 == 0): | |
con.commit() | |
cnt = 0 | |
con.commit() | |
con.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment