Skip to content

Instantly share code, notes, and snippets.

@phanect
Last active May 12, 2024 23:26
Show Gist options
  • Save phanect/5606282 to your computer and use it in GitHub Desktop.
Save phanect/5606282 to your computer and use it in GitHub Desktop.
A script used for the server migration from kde.gr.jp to jp.kde.org in 2014. It exported every PukiWiki page as a static HTML page.
#!/usr/bin/env python
from bs4 import BeautifulSoup
import urllib2
import os
import random
import string
# Export every page of a PukiWiki site into a PHP file that wraps the page's
# body HTML between the new site's "header.inc" and "footer.inc" includes.

# Directory (under the current working directory) that receives the files.
DOWNLOAD_DIR = "%s/download" % os.getcwd()
# PukiWiki "list" command: its <div id="body"> links to every wiki page.
PUKIWIKI_LIST_URL = "http://www.kde.gr.jp/pukiwiki/index.php?cmd=list"
# Character encoding used to decode the pages served by the wiki.
SOURCE_ENCODING = "EUC-JP"
# Name the parser explicitly so results do not depend on what is installed.
HTML_PARSER = "html.parser"
# Truncate file names to prevent "File name too long" errors.
MAX_FILENAME_LENGTH = 128
PAGE_EXTENSION = ".php"

PAGE_HEADER_TEMPLATE = """<?php
$page_title = "%s";
$site_root = "../";
include "header.inc";
?>
"""
PAGE_FOOTER = "<?php include \"footer.inc\"; ?>"


def fetch_soup(url):
    """Download *url*, decode it as EUC-JP and return the parsed document.

    The response is always closed, even when reading fails.
    """
    response = urllib2.urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()
    return BeautifulSoup(raw.decode(SOURCE_ENCODING), HTML_PARSER)


def collect_page_urls(list_url):
    """Return the href of every non-anchor link inside <div id="body">.

    Links without an href and in-page anchors ("#...") are skipped. An empty
    list is returned when the page has no element with id "body".
    """
    body = fetch_soup(list_url).find(id="body")
    if body is None:
        return []
    hrefs = (atag.get("href") for atag in body.find_all("a"))
    return [href for href in hrefs if href and not href.startswith("#")]


def php_escape(text):
    """Escape *text* for safe use inside a PHP double-quoted string.

    Backslashes are escaped first so that the backslashes added for the
    double quote and dollar sign are not escaped a second time.
    """
    return text.replace("\\", "\\\\").replace('"', '\\"').replace("$", "\\$")


def filename_from_url(url):
    """Derive the output file name (without extension) from *url*.

    The name is the text between the first and second "?" of the URL,
    truncated to MAX_FILENAME_LENGTH characters. Returns "" when the URL
    contains no "?".
    """
    parts = url.split("?")
    if len(parts) < 2:
        return ""
    # A "/" would make the name point outside of the download directory.
    return parts[1][:MAX_FILENAME_LENGTH].replace("/", "_")


def unique_filepath(directory, filename, extension=PAGE_EXTENSION):
    """Return a path in *directory* that does not name an existing file.

    On a collision a random three-letter suffix is inserted *before* the
    extension, so the result always keeps its extension.
    """
    filepath = "%s/%s%s" % (directory, filename, extension)
    while os.path.isfile(filepath):
        suffix = "".join(random.sample(string.ascii_uppercase, 3))
        filepath = "%s/%s_%s%s" % (directory, filename, suffix, extension)
    return filepath


def render_page(title, body_html):
    """Return the PHP source for one page: header, body HTML, then footer."""
    header = PAGE_HEADER_TEMPLATE % php_escape(title)
    return "%s%s%s" % (header, body_html, PAGE_FOOTER)


def export_page(url, download_dir):
    """Download the wiki page at *url* and write it into *download_dir*.

    Returns the path of the written file, or None when the page was skipped
    because no file name could be derived from the URL or because the page
    has no element with id "body".
    """
    filename = filename_from_url(url)
    if not filename:
        print("Skipping %s: cannot derive a file name" % url)
        return None

    page_soup = fetch_soup(url)
    body = page_soup.find(id="body")
    if body is None:
        print("Skipping %s: page has no body element" % url)
        return None

    # Use the title's text; the raw ".contents" list would be written as a
    # list repr instead of the title itself.
    title = page_soup.title.get_text() if page_soup.title is not None else ""

    filepath = unique_filepath(download_dir, filename)
    print("Writing to %s" % filepath)
    # decode_contents() serialises all children of the body element at once.
    page = render_page(title, body.decode_contents())
    # Encode explicitly and write bytes so non-ASCII text is stored as UTF-8.
    with open(filepath, "wb") as f:
        f.write(page.encode("utf-8"))
    return filepath


def main():
    """Export every page linked from the wiki's page list."""
    if not os.path.isdir(DOWNLOAD_DIR):
        os.makedirs(DOWNLOAD_DIR)
    for url in collect_page_urls(PUKIWIKI_LIST_URL):
        export_page(url, DOWNLOAD_DIR)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment