@cheeseonamonkey
Created March 31, 2024 23:40

usage:

ui:

(screenshot of the Tkinter window)

shell:

    py wiki_to_text.py '/hdd/Downloads/simplewiki-20240120-pages-meta-current.xml.bz2' `pwd`/output/
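
The core of the script is the wikitext-to-plaintext cleanup done by dewiki(). A minimal sketch of that step on a toy string (the sample and printed output are illustrative; it assumes wikitextparser and html2text are installed):

    import re
    import wikitextparser as wtp
    from html2text import html2text as htt

    sample = "'''Python''' is a [[programming language]] created by Guido van Rossum."
    text = wtp.parse(sample).plain_text()  # strip wiki markup (bold marks, links, templates)
    text = htt(text)                       # strip any leftover HTML
    text = re.sub(r'\s+', ' ', text)       # collapse whitespace
    print(text.strip())                    # roughly: "Python is a programming language created by Guido van Rossum."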
import argparse
import tkinter as tk
from tkinter import filedialog, scrolledtext
import webbrowser
import os
import bz2
import json
import re
from html2text import html2text as htt
import wikitextparser as wtp
from threading import Thread
import sys

def dewiki(text):
    text = wtp.parse(text).plain_text()  # wiki markup to plaintext
    text = htt(text)                     # remove any HTML
    text = text.replace('\\n', ' ')      # replace literal '\n' sequences
    text = re.sub(r'\s+', ' ', text)     # collapse excess whitespace
    return text

def analyze_chunk(text):
    try:
        if '<redirect title="' in text:  # this is not the main article
            return None
        if '(disambiguation)' in text:  # this is not an article
            return None
        else:
            title = text.split('<title>')[1].split('</title>')[0]
            title = htt(title)
            if ':' in title:  # most titles containing ':' are not articles we care about
                return None
        serial = text.split('<id>')[1].split('</id>')[0]
        content = text.split('</text')[0].split('<text')[1].split('>', maxsplit=1)[1]
        content = dewiki(content)
        return {'title': title.strip(), 'text': content.strip(), 'id': serial.strip()}
    except Exception as oops:
        print(oops)
        return None
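
# Illustrative example (values made up, not from a real dump): for an ordinary article
# page, analyze_chunk() returns a dict such as
#   {'title': 'Example article', 'text': 'Example article is ...', 'id': '12345'}
# and it returns None for redirects, disambiguation pages, and titles containing ':'.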

def save_article(article, savedir, log_text):
    doc = analyze_chunk(article)
    if doc:
        title = doc['title']
        # Replace slashes with underscores in the title
        filename = title.replace('/', '_') + '.txt'
        filepath = os.path.join(savedir, filename)
        if not os.path.exists(filepath):  # Check if file exists
            with open(filepath, 'w', encoding='utf-8') as outfile:
                outfile.write(doc['text'])
            log_text.insert(tk.END, filename.ljust(55) + "\n")
            print(filename)  # Print filename to console
        else:
            log_text.insert(tk.END, f"File {filename} already exists, skipping...\n")

def process_file_text(filename, savedir, log_text):
    try:
        article = ''
        with open(filename, 'r', encoding='utf-8') as infile:
            for line in infile:
                if '<page>' in line:      # start of a new article
                    article = ''
                elif '</page>' in line:   # end of article
                    Thread(target=save_article, args=(article, savedir, log_text)).start()
                else:
                    article += line
    except Exception as e:
        log_text.insert(tk.END, f"Error processing article: {str(e)}\n")
        print(f"Error processing article: {str(e)}")

# Note: these two helpers are not wired to any button in the current UI.
def browse_file(entry):
    filename = filedialog.askopenfilename()
    entry.delete(0, tk.END)
    entry.insert(0, filename)


def browse_directory(entry):
    directory = filedialog.askdirectory()
    entry.delete(0, tk.END)
    entry.insert(0, directory)

def decompress_file(xml_file):
    if xml_file.endswith('.bz2'):
        decompressed_file = xml_file[:-4]  # Remove .bz2 extension
        # Note: reads the whole archive into memory at once; fine for the Simple English
        # dump, but heavy for the full English dump.
        with open(decompressed_file, 'wb') as f_out, bz2.BZ2File(xml_file, 'rb') as f_in:
            f_out.write(f_in.read())
        return decompressed_file
    return xml_file

def convert_to_json(xml_file, json_dir, decompress, log_text):
    log_text.delete(1.0, tk.END)  # Clear previous logs
    log_text.insert(tk.END, f"Starting conversion:\n -input: {xml_file}\n")
    log_text.insert(tk.END, f" -output: {json_dir}\n")
    log_text.update()
    log_text.insert(tk.END, "Decompressing...\n")
    log_text.update()
    source_file = xml_file
    xml_file = decompress_file(xml_file) if decompress else xml_file
    log_text.insert(tk.END, f"Decompressed: {xml_file}\n")
    log_text.update()
    log_text.insert(tk.END, "Conversion in progress...\n")
    log_text.update()
    process_file_text(xml_file, json_dir, log_text)
    log_text.insert(tk.END, "Conversion completed!\n")
    log_text.update()
    if decompress and xml_file != source_file:  # remove the temporary decompressed copy
        os.remove(xml_file)

def open_link(url):
    webbrowser.open_new(url)

def main():
    parser = argparse.ArgumentParser(description='Process a Wikipedia XML dump file and save each article as a text file.')
    parser.add_argument('xml_file', help='Wikipedia XML dump file (.xml or .xml.bz2)')
    parser.add_argument('json_dir', help='Directory to save the extracted text files')
    args = parser.parse_args()

    window = tk.Tk()
    window.title("Wikipedia to text")

    # Widgets
    tk.Label(window, text="Wikipedia XML dump file:").grid(row=0, column=0, padx=5, pady=5)
    xml_entry = tk.Entry(window, width=45)
    xml_entry.grid(row=0, column=1, padx=5, pady=5)
    xml_entry.insert(0, args.xml_file)

    tk.Label(window, text="Output directory:").grid(row=1, column=0, padx=5, pady=5)
    json_entry = tk.Entry(window, width=45)
    json_entry.grid(row=1, column=1, padx=5, pady=5)
    json_entry.insert(0, args.json_dir)

    decompress_var = tk.BooleanVar()
    decompress_check = tk.Checkbutton(window, text="Decompress if .bz2", variable=decompress_var)
    decompress_check.grid(row=2, columnspan=3, padx=5, pady=5)

    log_text = scrolledtext.ScrolledText(window, width=70, height=13)
    log_text.grid(row=3, columnspan=3, padx=5, pady=5)

    convert_button = tk.Button(window, text="Convert",
                               command=lambda: convert_to_json(xml_entry.get(), json_entry.get(),
                                                               decompress_var.get(), log_text))
    convert_button.grid(row=4, columnspan=3, padx=5, pady=5)

    # Links to the compressed Wikipedia dumps (each link needs its own tag,
    # otherwise the second tag_bind overrides the first)
    link_text = tk.Text(window, height=1, width=65)
    link_text.grid(row=5, columnspan=3, padx=5, pady=5)
    link_text.insert(tk.END, "Compressed Wikipedia dumps: ")
    link_text.tag_configure("link_en", foreground="blue", underline=True)
    link_text.tag_configure("link_simple", foreground="blue", underline=True)
    link_text.tag_bind("link_en", "<Button-1>", lambda e: open_link("https://dumps.wikimedia.org/enwiki/20240120/"))
    link_text.tag_bind("link_simple", "<Button-1>", lambda e: open_link("https://dumps.wikimedia.org/simplewiki/20240120/"))
    link_text.insert(tk.END, "English (21GB)", "link_en")
    link_text.insert(tk.END, " | ")
    link_text.insert(tk.END, "Simple English (1GB)", "link_simple")
    link_text.config(state=tk.DISABLED)

    window.mainloop()

if __name__ == "__main__":
    main()
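
For batch runs without the GUI, the parsing helper can be reused directly. A rough headless sketch, assuming the gist is saved as wiki_to_text.py on the import path; the dump filename is illustrative, and unlike the GUI flow it streams the .bz2 archive instead of decompressing it to disk first:

    import bz2
    import os
    from wiki_to_text import analyze_chunk  # helper defined in the gist above

    DUMP = 'simplewiki-20240120-pages-meta-current.xml.bz2'  # illustrative path
    OUTDIR = 'output'
    os.makedirs(OUTDIR, exist_ok=True)

    article = ''
    with bz2.open(DUMP, 'rt', encoding='utf-8') as infile:
        for line in infile:
            if '<page>' in line:       # start collecting a new article
                article = ''
            elif '</page>' in line:    # article finished: parse and save it
                doc = analyze_chunk(article)
                if doc:
                    path = os.path.join(OUTDIR, doc['title'].replace('/', '_') + '.txt')
                    with open(path, 'w', encoding='utf-8') as out:
                        out.write(doc['text'])
            else:
                article += line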