# Koasing/ttxml2wpjson.py

## ttxml2wpjson.py
from datetime import datetime
from urllib.parse import quote
from xml.etree import ElementTree

import json
import base64
import os
import os.path
import re

# Site that will serve the migrated blog.
# The trailing "/" is required because paths are appended to it directly.
site = 'http://your_domain/'

# Base URL of the WordPress installation below that site.
base_url = '{}wordpress/'.format(site)

# GUID template; "{}" is filled with the Tistory post id so every posting stays unique.
guid_pattern = 'http://your.tistory.com/{}'

# Tistory visibility value -> WordPress post status.
post_status = dict(public='publish', private='private', draft='draft', trash='trash')

tree = ElementTree.parse('Tistory-Backup.xml')
root = tree.getroot()

# Collections that end up in the export document.
authors = {}
posts, categories, tags = [], [], []
terms = []  # currently unused; kept for a future extension

for raw_category in root.findall('category'):
    # Level-1 ids are the priority multiplied by 100 so that level-2
    # priorities can simply be added to them.  Raise the factor if the blog
    # has more than 100 categories.
    parent_id = int(raw_category.find('priority').text) * 100
    parent_name = raw_category.find('name').text
    parent_slug = quote(parent_name.replace(' ', '-')).lower()
    categories.append({
        'term_id': parent_id,
        'cat_name': parent_name,
        'category_nicename': parent_slug,
        'category_parent': '',
        'category_description': '',  # Tistory has no category description
    })

    for raw_subcategory in raw_category.findall('category'):
        # Level-2 category: id is offset from its parent, parent is referenced by slug.
        child_id = int(raw_subcategory.find('priority').text) + parent_id
        child_name = raw_subcategory.find('name').text
        categories.append({
            'term_id': child_id,
            'cat_name': child_name,
            'category_nicename': quote(child_name.replace(' ', '-')).lower(),
            'category_parent': parent_slug,
            'category_description': '',
        })

def process_body(content, attachment, useLessMargin = False):
    """Convert the HTML body of a Tistory post into WordPress markup.

    Tistory-specific ``[#M_..._M#]`` (more/less) and ``[##_..._##]``
    (image / gallery) tags are rewritten, plain ``<p>`` wrappers and
    ``<br>`` tags become newlines, and a few redundant ``<span>`` styles are
    removed.

    Args:
        content: Post body as stored in the backup.  ``None`` (an empty
            ``<content>`` element) yields an empty string.
        attachment: Dict mapping the attachment name used inside image and
            gallery tags to a dict of that attachment's properties.  The
            output templates read ``mime``, ``id``, ``url`` and ``label``
            from it, plus ``width`` for captioned images.  The dict is not
            modified.
        useLessMargin: When truthy, a stripped ``<p>`` block is followed by
            one newline instead of two.

    Returns:
        The converted body.

    Raises:
        KeyError: A tag names an attachment that is not in ``attachment``,
            or the entry lacks a property its output template needs.
        IndexError: An image or gallery tag has fewer ``|``-separated
            fields than its format requires.
    """
    if content is None:
        return ''

    alignments = {'C': 'aligncenter', 'L': 'alignleft', 'R': 'alignright'}

    def render_attachment(name, align, attr, caption):
        # Shared by image and gallery tags: markup for a single attachment.
        # Work on a copy so the caller's attachment table stays untouched.
        params = dict(attachment[name])
        params['align'] = align
        params['base'] = base_url   # available to templates; the stock ones below do not use it
        params['attr'] = attr
        params['caption'] = caption

        if not params['mime'].startswith('image'):
            # anything that is not an image becomes a plain link
            return '<a href="{url}">{label}</a>'.format_map(params)
        if caption:
            return '[caption id="attachment_{id}" align="{align}" width="{width}"]<img class="wp-image-{id} size-full" src="{url}" {attr} /> {caption}[/caption]'.format_map(params)
        return '<img class="wp-image-{id} size-full {align}" src="{url}" {attr} />'.format_map(params)

    def process_moreless(m):
        # "[#M_(more)|(less)|(content)_M#]"
        # depends on "WP show more plugin" ( https://ko.wordpress.org/plugins/wp-show-more/ ).
        return '[show_more more="{}" less="{}"]{}[/show_more]'.format(*m.groups())

    def process_image(m):
        # "[##_(count)(align)(|(filename)|(attribute)|(caption))(...)(...)_##]"
        align = alignments.get(m.group(2), 'alignnone')
        fields = m.group(3).split('|')
        rendered = []
        for start in range(0, len(fields), 3):
            # each image contributes three fields: filename, attribute, caption
            rendered.append(render_attachment(fields[start].strip(),
                                              align,
                                              fields[start + 1].strip(),
                                              fields[start + 2].strip()))
        return ''.join(rendered)

    def process_gallery(m):
        # "[##_Gallery(|(filename)|(caption))(...)(...)|(attribute)_##]"
        fields = m.group(1).split('|')
        # the last field is the attribute string shared by all gallery items
        attr = fields[-1].strip()
        fields = fields[:-1]
        rendered = []
        for start in range(0, len(fields), 2):
            rendered.append(render_attachment(fields[start].strip(),
                                              'alignnone',
                                              attr,
                                              fields[start + 1].strip()))
        return ''.join(rendered)

    # Drop source line breaks, but keep the ones inside <pre> by parking them
    # as "<br />" (turned back into newlines by the simple replacements below).
    content = re.sub(r'<pre([\s\S]*?)\/pre>',
                     lambda m: '<pre' + m.group(1).replace('\n', '<br />') + '/pre>',
                     content)
    content = content.replace('\n', '')

    # process tistory-specific tags
    content = re.sub(r'\[#M_(.+?)\|(.+?)\|([\s\S]+?)_M#\]', process_moreless, content)
    content = re.sub(r'\[##_(\d)([CLR])\|(.+?)_##\]',       process_image,    content)
    content = re.sub(r'\[##_Gallery\|(.+?)_##\]',           process_gallery,  content)

    # Strip attribute-less <p> wrappers; the margin flag picks how many
    # newlines replace each paragraph.
    margin = '\n' if useLessMargin else '\n\n'
    content = re.sub(r'<[pP]>\s*?<\/[pP]>', margin, content)
    content = re.sub(r'<[pP]><br ?\/?><\/[pP]>', margin, content)
    content = re.sub(r'<[pP]>(.*?)<\/[pP]>', '\\1' + margin, content)

    # Simple literal replacements, applied in order.  "&quot;" must come
    # before the bare "&quot" form, otherwise a stray ";" is left behind.
    replacements = (('<br>',   '\n'),
                    ('<br/>',  '\n'),
                    ('<br />', '\n'),
                    ('&nbsp;', ' '),
                    ('&quot;', '"'),
                    ('&quot',  '"'))
    for old, new in replacements:
        content = content.replace(old, new)

    # Remove span styles that carry no information, then unwrap spans whose
    # style attribute ended up empty.
    for redundant in (r'font-size: 9pt;', r'line-height: 1\.5;', r'background-color: transparent;'):
        content = re.sub(r'<span style="(.*?)' + redundant + r'(.*?)">', '<span style="\\1\\2">', content)
    content = re.sub(r'<span style="\s*?">([\s\S]*?)<\/span>', '\\1', content)

    return content

def process_comment(raw_comment, next_id, parent_id):
    """Flatten a Tistory comment element and its replies into WordPress comments.

    Args:
        raw_comment: ``<comment>`` element; nested ``<comment>`` children are
            converted as replies to it.
        next_id: ``comment_id`` given to this comment; its replies receive
            the ids that follow.
        parent_id: ``comment_id`` of the parent comment (the caller in this
            file passes 0 for top-level comments).

    Returns:
        List of comment dicts: this comment first, then all of its replies
        depth-first.  The list length equals the number of ids consumed.
    """
    from datetime import timezone  # the file-level import only provides the datetime class

    raw_author = raw_comment.find('commenter')

    # Link the commenter to a registered post author when the ids match;
    # 0 is used otherwise.
    known_author = authors.get(raw_author.get('id'))
    user_id = known_author['author_id'] if known_author is not None else 0

    written = int(raw_comment.find('written').text)
    date_format = '%Y-%m-%d %H:%M:%S'

    comment = {
        'comment_id': next_id,
        # findtext() yields '' for a missing or empty element instead of failing
        'comment_author': raw_author.findtext('name', default=''),
        'comment_author_email': '',     # tistory does not support comment email
        'comment_author_IP': raw_author.findtext('ip', default=''),
        'comment_author_url': raw_author.findtext('homepage', default=''),
        # local time of the machine running the conversion
        'comment_date': datetime.fromtimestamp(written).strftime(date_format),
        # the same instant expressed in GMT, not a copy of the local string
        'comment_date_gmt': datetime.fromtimestamp(written, tz=timezone.utc).strftime(date_format),
        'comment_content': raw_comment.findtext('content', default=''),
        'comment_approved': '1',        # stored as a string in the original export format
        'comment_type': '',
        'comment_parent': str(parent_id),
        'comment_user_id': user_id,
        'commentmeta': [],
    }
    comments = [comment]

    # Replies continue the id sequence right after this comment.
    following_id = next_id + 1
    for raw_subcomment in raw_comment.findall('comment'):
        replies = process_comment(raw_subcomment, following_id, next_id)
        comments.extend(replies)
        following_id += len(replies)

    return comments


# author and tag are included in post
def process_post(key_tag):
    """Convert every ``<key_tag>`` element found directly under the backup root.

    For each element this registers its author in ``authors``, appends the
    converted post followed by its attachments to ``posts``, registers
    unseen tags in ``tags`` and writes every attachment's decoded content to
    ``attach/<post id>/<label>``.

    Args:
        key_tag: Element name to convert.  ``'post'`` produces WordPress
            posts; any other value (this file uses ``'notice'``) produces
            pages with comments and pings closed.
    """
    from datetime import timezone  # the file-level import only provides the datetime class

    date_format = '%Y-%m-%d %H:%M:%S'
    is_post = key_tag == 'post'

    def slugify(text):
        # spaces -> "-", then URL-quote and lower-case
        return quote(text.replace(' ', '-')).lower()

    def wp_dates(seconds):
        # (local, GMT) strings for one instant; the *_gmt fields get real GMT
        local = datetime.fromtimestamp(seconds).strftime(date_format)
        gmt = datetime.fromtimestamp(seconds, tz=timezone.utc).strftime(date_format)
        return local, gmt

    for raw_post in root.findall(key_tag):
        raw_id = raw_post.find('id').text
        print('Processing id {}'.format(raw_id))

        # The backup only carries a numeric author id, so a placeholder
        # profile is generated; merging real author data is left to the user.
        author_text = raw_post.find('author').text
        author_id = int(author_text)
        login = str(author_id)
        if login not in authors:
            authors[login] = {
                'author_id': author_id,
                'author_login': login,
                'author_email': 'fakemail@fakemail.com',
                'author_display_name': login,
                'author_first_name': '',
                'author_last_name': '',
            }

        post_id = int(raw_id)
        title = raw_post.find('title').text
        post_date, post_date_gmt = wp_dates(int(raw_post.find('created').text))
        status = post_status.get(raw_post.find('visibility').text, 'private')   # fallback to private if unknown status
        raw_password = raw_post.find('password')

        post = {
            'post_title': title,
            'guid': guid_pattern.format(raw_id),
            'post_author': author_text,
            'post_content': '',             # filled in once the attachments are known
            'post_excerpt': '',             # tistory doesn't support excerpt text
            'post_id': post_id,
            'post_date': post_date,
            'post_date_gmt': post_date_gmt,
            'comment_status': 'open' if is_post and raw_post.find('acceptComment').text == '1' else 'closed',
            'ping_status': 'open' if is_post and raw_post.find('acceptTrackback').text == '1' else 'closed',
            # posts use their "slogan" attribute as-is; other entries derive the name from the title
            'post_name': quote(raw_post.attrib['slogan']).lower() if is_post else slugify(title),
            'status': status,
            'post_parent': 0,               # only attachments point at a parent
            'menu_order': 0,
            'post_type': 'post' if is_post else 'page',
            'post_password': raw_password.text if status == 'private' and raw_password is not None else '',
            'is_sticky': 0,
            'attachment_url': '',
            'terms': [],                    # category and tags
            'postmeta': [],                 # post metadata
            'comments': [],
        }

        # Comment ids start at post_id * 100 and continue across threads.
        comments = post['comments']
        for raw_comment in raw_post.findall('comment'):
            comments.extend(process_comment(raw_comment, post_id * 100 + len(comments), 0))

        attachments = []
        attachments_table = {}
        for raw_attachment in raw_post.findall('attachment'):
            label = raw_attachment.find('label').text
            # tistory bug handling (empty attachment)
            if label is None:
                continue

            label = label.replace(' ', '_')

            attached = int(raw_attachment.find('attached').text)
            dt = datetime.fromtimestamp(attached)
            attach_date, attach_date_gmt = wp_dates(attached)
            fn = 'attach/{}/{}'.format(post_id, label)
            attachment_title = os.path.splitext(label)[0]

            attachment = {
                'post_title': attachment_title,
                'guid': '',
                'post_author': post['post_author'],
                'post_content': '',
                'post_excerpt': '',
                # attachment ids continue after the ids used for this post's comments
                'post_id': post_id * 100 + len(comments) + len(attachments),
                'post_date': attach_date,
                'post_date_gmt': attach_date_gmt,
                'comment_status': 'closed',
                'ping_status': 'closed',
                'post_name': slugify(attachment_title),
                'status': 'inherit',
                'post_parent': post_id,
                'menu_order': 0,
                'post_type': 'attachment',
                'post_password': '',
                'is_sticky': 0,
                'attachment_url': site + fn,
            }
            attachments.append(attachment)

            # The backup stores the file base64-encoded; write it next to the script.
            os.makedirs(os.path.dirname(fn), exist_ok=True)
            with open(fn, 'wb') as outfile:
                outfile.write(base64.b64decode(raw_attachment.find('content').text))

            # Sanitized label (presumably mirrors WordPress' own file-name
            # sanitizing - TODO confirm).  Order of the steps is kept as is.
            sanitized = quote(label).lower()
            for s in ("?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "%", "+"):
                sanitized = sanitized.replace(s, '')
            for s in ("%20", "+"):
                sanitized = sanitized.replace(s, '-')
            sanitized = re.sub('[\r\n\t -]+', '-', sanitized)

            # Lookup entry used by process_body: the element's XML attributes
            # plus the values computed above.
            entry = raw_attachment.attrib.copy()
            entry.update({'label': sanitized,
                          'year': dt.year,
                          'month': dt.month,
                          'id': attachment['post_id'],
                          'url': attachment['attachment_url']})
            attachments_table[raw_attachment.find('name').text] = entry

        # Categories and tags of this post; elements without text are skipped.
        for category in raw_post.findall('category'):
            if category.text:
                post['terms'].append({'name': category.text,
                                      'slug': slugify(category.text),
                                      'domain': 'category'})

        for tag in raw_post.findall('tag'):
            if tag.text:
                post['terms'].append({'name': tag.text,
                                      'slug': slugify(tag.text),
                                      'domain': 'post_tag'})

        # Empty collections are left out of the export.
        if not post['terms']:
            del post['terms']

        if not post['postmeta']:
            del post['postmeta']

        # findtext() tolerates a missing element; an empty body becomes ''.
        post['post_content'] = process_body(raw_post.findtext('content') or '',
                                            attachments_table,
                                            raw_post.findtext('uselessMargin') == '1')

        # Extend in place instead of rebuilding the whole list for every post.
        posts.append(post)
        posts.extend(attachments)

        # Register tags not seen before; term_id is the position in ``tags``.
        for raw_tag in raw_post.findall('tag'):
            tag_name = raw_tag.text
            if not tag_name or any(item['tag_name'] == tag_name for item in tags):
                continue
            tags.append({'term_id': len(tags),
                         'tag_name': tag_name,
                         'tag_slug': slugify(tag_name),
                         'tag_description': ''})

# Posts first, then notices.
for key_tag in ('post', 'notice'):
    process_post(key_tag)

# Assemble the export document.  'terms' is always written as an empty list.
backup = {
    'authors': authors,
    'posts': posts,
    'categories': categories,
    'tags': tags,
    'terms': [],
    'base_url': base_url,
    'version': '1.2',
}

with open('export.json', mode='w', encoding='utf8') as outfile:
    json.dump(backup, outfile, indent=2, ensure_ascii=False)
	from datetime import datetime
	from urllib.parse import quote
	from xml.etree import ElementTree

	import json
	import base64
	import os
	import os.path
	import re

	# Site that will serve the migrated blog.
	# The trailing "/" is required because paths are appended to it directly.
	site = 'http://your_domain/'

	# Base URL of the WordPress installation below that site.
	base_url = site + 'wordpress/'

	# GUID template; "{}" is filled with the Tistory post id so every posting stays unique.
	guid_pattern = 'http://your.tistory.com/{}'

	# Tistory visibility value -> WordPress post status.
	post_status = {
	    'public': 'publish',
	    'private': 'private',
	    'draft': 'draft',
	    'trash': 'trash',
	}

	tree = ElementTree.parse(source='Tistory-Backup.xml')
	root = tree.getroot()

	# Collections that end up in the export document.
	authors = {}

	posts = []
	categories = []
	tags = []
	terms = []  # currently unused; kept for a future extension

	for raw_category in root.findall('category'):
	    # Level-1 ids are the priority multiplied by 100 so that level-2
	    # priorities can simply be added to them.  Raise the factor if the
	    # blog has more than 100 categories.
	    parent_id = int(raw_category.find('priority').text) * 100
	    parent_name = raw_category.find('name').text
	    parent_slug = quote(parent_name.replace(' ', '-')).lower()
	    categories.append({
	        'term_id': parent_id,
	        'cat_name': parent_name,
	        'category_nicename': parent_slug,
	        'category_parent': '',
	        'category_description': '',  # Tistory has no category description
	    })

	    for raw_subcategory in raw_category.findall('category'):
	        # Level-2 category: id is offset from its parent, parent is referenced by slug.
	        child_id = int(raw_subcategory.find('priority').text) + parent_id
	        child_name = raw_subcategory.find('name').text
	        categories.append({
	            'term_id': child_id,
	            'cat_name': child_name,
	            'category_nicename': quote(child_name.replace(' ', '-')).lower(),
	            'category_parent': parent_slug,
	            'category_description': '',
	        })

	def process_body(content, attachment, useLessMargin = False):
	    """Convert the HTML body of a Tistory post into WordPress markup.

	    Tistory-specific ``[#M_..._M#]`` (more/less) and ``[##_..._##]``
	    (image / gallery) tags are rewritten, plain ``<p>`` wrappers and
	    ``<br>`` tags become newlines, and a few redundant ``<span>`` styles
	    are removed.

	    Args:
	        content: Post body as stored in the backup.  ``None`` (an empty
	            ``<content>`` element) yields an empty string.
	        attachment: Dict mapping the attachment name used inside image
	            and gallery tags to a dict of that attachment's properties.
	            The output templates read ``mime``, ``id``, ``url`` and
	            ``label`` from it, plus ``width`` for captioned images.  The
	            dict is not modified.
	        useLessMargin: When truthy, a stripped ``<p>`` block is followed
	            by one newline instead of two.

	    Returns:
	        The converted body.

	    Raises:
	        KeyError: A tag names an attachment that is not in
	            ``attachment``, or the entry lacks a property its output
	            template needs.
	        IndexError: An image or gallery tag has fewer ``|``-separated
	            fields than its format requires.
	    """
	    if content is None:
	        return ''

	    alignments = {'C': 'aligncenter', 'L': 'alignleft', 'R': 'alignright'}

	    def render_attachment(name, align, attr, caption):
	        # Shared by image and gallery tags: markup for a single attachment.
	        # Work on a copy so the caller's attachment table stays untouched.
	        params = dict(attachment[name])
	        params['align'] = align
	        params['base'] = base_url   # available to templates; the stock ones below do not use it
	        params['attr'] = attr
	        params['caption'] = caption

	        if not params['mime'].startswith('image'):
	            # anything that is not an image becomes a plain link
	            return '<a href="{url}">{label}</a>'.format_map(params)
	        if caption:
	            return '[caption id="attachment_{id}" align="{align}" width="{width}"]<img class="wp-image-{id} size-full" src="{url}" {attr} /> {caption}[/caption]'.format_map(params)
	        return '<img class="wp-image-{id} size-full {align}" src="{url}" {attr} />'.format_map(params)

	    def process_moreless(m):
	        # "[#M_(more)|(less)|(content)_M#]"
	        # depends on "WP show more plugin" ( https://ko.wordpress.org/plugins/wp-show-more/ ).
	        return '[show_more more="{}" less="{}"]{}[/show_more]'.format(*m.groups())

	    def process_image(m):
	        # "[##_(count)(align)(|(filename)|(attribute)|(caption))(...)(...)_##]"
	        align = alignments.get(m.group(2), 'alignnone')
	        fields = m.group(3).split('|')
	        rendered = []
	        for start in range(0, len(fields), 3):
	            # each image contributes three fields: filename, attribute, caption
	            rendered.append(render_attachment(fields[start].strip(),
	                                              align,
	                                              fields[start + 1].strip(),
	                                              fields[start + 2].strip()))
	        return ''.join(rendered)

	    def process_gallery(m):
	        # "[##_Gallery(|(filename)|(caption))(...)(...)|(attribute)_##]"
	        fields = m.group(1).split('|')
	        # the last field is the attribute string shared by all gallery items
	        attr = fields[-1].strip()
	        fields = fields[:-1]
	        rendered = []
	        for start in range(0, len(fields), 2):
	            rendered.append(render_attachment(fields[start].strip(),
	                                              'alignnone',
	                                              attr,
	                                              fields[start + 1].strip()))
	        return ''.join(rendered)

	    # Drop source line breaks, but keep the ones inside <pre> by parking
	    # them as "<br />" (turned back into newlines by the simple
	    # replacements below).
	    content = re.sub(r'<pre([\s\S]*?)\/pre>',
	                     lambda m: '<pre' + m.group(1).replace('\n', '<br />') + '/pre>',
	                     content)
	    content = content.replace('\n', '')

	    # process tistory-specific tags
	    content = re.sub(r'\[#M_(.+?)\|(.+?)\|([\s\S]+?)_M#\]', process_moreless, content)
	    content = re.sub(r'\[##_(\d)([CLR])\|(.+?)_##\]', process_image, content)
	    content = re.sub(r'\[##_Gallery\|(.+?)_##\]', process_gallery, content)

	    # Strip attribute-less <p> wrappers; the margin flag picks how many
	    # newlines replace each paragraph.
	    margin = '\n' if useLessMargin else '\n\n'
	    content = re.sub(r'<[pP]>\s*?<\/[pP]>', margin, content)
	    content = re.sub(r'<[pP]><br ?\/?><\/[pP]>', margin, content)
	    content = re.sub(r'<[pP]>(.*?)<\/[pP]>', '\\1' + margin, content)

	    # Simple literal replacements, applied in order.  "&quot;" must come
	    # before the bare "&quot" form, otherwise a stray ";" is left behind.
	    replacements = (('<br>', '\n'),
	                    ('<br/>', '\n'),
	                    ('<br />', '\n'),
	                    ('&nbsp;', ' '),
	                    ('&quot;', '"'),
	                    ('&quot', '"'))
	    for old, new in replacements:
	        content = content.replace(old, new)

	    # Remove span styles that carry no information, then unwrap spans
	    # whose style attribute ended up empty.
	    for redundant in (r'font-size: 9pt;', r'line-height: 1\.5;', r'background-color: transparent;'):
	        content = re.sub(r'<span style="(.*?)' + redundant + r'(.*?)">', '<span style="\\1\\2">', content)
	    content = re.sub(r'<span style="\s*?">([\s\S]*?)<\/span>', '\\1', content)

	    return content

	def process_comment(raw_comment, next_id, parent_id):
	    """Flatten a Tistory comment element and its replies into WordPress comments.

	    Args:
	        raw_comment: ``<comment>`` element; nested ``<comment>`` children
	            are converted as replies to it.
	        next_id: ``comment_id`` given to this comment; its replies
	            receive the ids that follow.
	        parent_id: ``comment_id`` of the parent comment (the caller in
	            this file passes 0 for top-level comments).

	    Returns:
	        List of comment dicts: this comment first, then all of its
	        replies depth-first.  The list length equals the number of ids
	        consumed.
	    """
	    from datetime import timezone  # the file-level import only provides the datetime class

	    raw_author = raw_comment.find('commenter')

	    # Link the commenter to a registered post author when the ids match;
	    # 0 is used otherwise.
	    known_author = authors.get(raw_author.get('id'))
	    user_id = known_author['author_id'] if known_author is not None else 0

	    written = int(raw_comment.find('written').text)
	    date_format = '%Y-%m-%d %H:%M:%S'

	    comment = {
	        'comment_id': next_id,
	        # findtext() yields '' for a missing or empty element instead of failing
	        'comment_author': raw_author.findtext('name', default=''),
	        'comment_author_email': '',     # tistory does not support comment email
	        'comment_author_IP': raw_author.findtext('ip', default=''),
	        'comment_author_url': raw_author.findtext('homepage', default=''),
	        # local time of the machine running the conversion
	        'comment_date': datetime.fromtimestamp(written).strftime(date_format),
	        # the same instant expressed in GMT, not a copy of the local string
	        'comment_date_gmt': datetime.fromtimestamp(written, tz=timezone.utc).strftime(date_format),
	        'comment_content': raw_comment.findtext('content', default=''),
	        'comment_approved': '1',        # stored as a string in the original export format
	        'comment_type': '',
	        'comment_parent': str(parent_id),
	        'comment_user_id': user_id,
	        'commentmeta': [],
	    }
	    comments = [comment]

	    # Replies continue the id sequence right after this comment.
	    following_id = next_id + 1
	    for raw_subcomment in raw_comment.findall('comment'):
	        replies = process_comment(raw_subcomment, following_id, next_id)
	        comments.extend(replies)
	        following_id += len(replies)

	    return comments


	# author and tag are included in post
	def process_post(key_tag):
	    """Convert every ``<key_tag>`` element found directly under the backup root.

	    For each element this registers its author in ``authors``, appends
	    the converted post followed by its attachments to ``posts``,
	    registers unseen tags in ``tags`` and writes every attachment's
	    decoded content to ``attach/<post id>/<label>``.

	    Args:
	        key_tag: Element name to convert.  ``'post'`` produces WordPress
	            posts; any other value (this file uses ``'notice'``) produces
	            pages with comments and pings closed.
	    """
	    from datetime import timezone  # the file-level import only provides the datetime class

	    date_format = '%Y-%m-%d %H:%M:%S'
	    is_post = key_tag == 'post'

	    def slugify(text):
	        # spaces -> "-", then URL-quote and lower-case
	        return quote(text.replace(' ', '-')).lower()

	    def wp_dates(seconds):
	        # (local, GMT) strings for one instant; the *_gmt fields get real GMT
	        local = datetime.fromtimestamp(seconds).strftime(date_format)
	        gmt = datetime.fromtimestamp(seconds, tz=timezone.utc).strftime(date_format)
	        return local, gmt

	    for raw_post in root.findall(key_tag):
	        raw_id = raw_post.find('id').text
	        print('Processing id {}'.format(raw_id))

	        # The backup only carries a numeric author id, so a placeholder
	        # profile is generated; merging real author data is left to the user.
	        author_text = raw_post.find('author').text
	        author_id = int(author_text)
	        login = str(author_id)
	        if login not in authors:
	            authors[login] = {
	                'author_id': author_id,
	                'author_login': login,
	                'author_email': 'fakemail@fakemail.com',
	                'author_display_name': login,
	                'author_first_name': '',
	                'author_last_name': '',
	            }

	        post_id = int(raw_id)
	        title = raw_post.find('title').text
	        post_date, post_date_gmt = wp_dates(int(raw_post.find('created').text))
	        status = post_status.get(raw_post.find('visibility').text, 'private')   # fallback to private if unknown status
	        raw_password = raw_post.find('password')

	        post = {
	            'post_title': title,
	            'guid': guid_pattern.format(raw_id),
	            'post_author': author_text,
	            'post_content': '',             # filled in once the attachments are known
	            'post_excerpt': '',             # tistory doesn't support excerpt text
	            'post_id': post_id,
	            'post_date': post_date,
	            'post_date_gmt': post_date_gmt,
	            'comment_status': 'open' if is_post and raw_post.find('acceptComment').text == '1' else 'closed',
	            'ping_status': 'open' if is_post and raw_post.find('acceptTrackback').text == '1' else 'closed',
	            # posts use their "slogan" attribute as-is; other entries derive the name from the title
	            'post_name': quote(raw_post.attrib['slogan']).lower() if is_post else slugify(title),
	            'status': status,
	            'post_parent': 0,               # only attachments point at a parent
	            'menu_order': 0,
	            'post_type': 'post' if is_post else 'page',
	            'post_password': raw_password.text if status == 'private' and raw_password is not None else '',
	            'is_sticky': 0,
	            'attachment_url': '',
	            'terms': [],                    # category and tags
	            'postmeta': [],                 # post metadata
	            'comments': [],
	        }

	        # Comment ids start at post_id * 100 and continue across threads.
	        comments = post['comments']
	        for raw_comment in raw_post.findall('comment'):
	            comments.extend(process_comment(raw_comment, post_id * 100 + len(comments), 0))

	        attachments = []
	        attachments_table = {}
	        for raw_attachment in raw_post.findall('attachment'):
	            label = raw_attachment.find('label').text
	            # tistory bug handling (empty attachment)
	            if label is None:
	                continue

	            label = label.replace(' ', '_')

	            attached = int(raw_attachment.find('attached').text)
	            dt = datetime.fromtimestamp(attached)
	            attach_date, attach_date_gmt = wp_dates(attached)
	            fn = 'attach/{}/{}'.format(post_id, label)
	            attachment_title = os.path.splitext(label)[0]

	            attachment = {
	                'post_title': attachment_title,
	                'guid': '',
	                'post_author': post['post_author'],
	                'post_content': '',
	                'post_excerpt': '',
	                # attachment ids continue after the ids used for this post's comments
	                'post_id': post_id * 100 + len(comments) + len(attachments),
	                'post_date': attach_date,
	                'post_date_gmt': attach_date_gmt,
	                'comment_status': 'closed',
	                'ping_status': 'closed',
	                'post_name': slugify(attachment_title),
	                'status': 'inherit',
	                'post_parent': post_id,
	                'menu_order': 0,
	                'post_type': 'attachment',
	                'post_password': '',
	                'is_sticky': 0,
	                'attachment_url': site + fn,
	            }
	            attachments.append(attachment)

	            # The backup stores the file base64-encoded; write it next to the script.
	            os.makedirs(os.path.dirname(fn), exist_ok=True)
	            with open(fn, 'wb') as outfile:
	                outfile.write(base64.b64decode(raw_attachment.find('content').text))

	            # Sanitized label (presumably mirrors WordPress' own file-name
	            # sanitizing - TODO confirm).  Order of the steps is kept as is.
	            sanitized = quote(label).lower()
	            for s in ("?", "[", "]", "/", "\\", "=", "<", ">", ":", ";", ",", "'", "\"", "&", "$", "#", "*", "(", ")", "|", "~", "`", "!", "{", "}", "%", "+"):
	                sanitized = sanitized.replace(s, '')
	            for s in ("%20", "+"):
	                sanitized = sanitized.replace(s, '-')
	            sanitized = re.sub('[\r\n\t -]+', '-', sanitized)

	            # Lookup entry used by process_body: the element's XML
	            # attributes plus the values computed above.
	            entry = raw_attachment.attrib.copy()
	            entry.update({'label': sanitized,
	                          'year': dt.year,
	                          'month': dt.month,
	                          'id': attachment['post_id'],
	                          'url': attachment['attachment_url']})
	            attachments_table[raw_attachment.find('name').text] = entry

	        # Categories and tags of this post; elements without text are skipped.
	        for category in raw_post.findall('category'):
	            if category.text:
	                post['terms'].append({'name': category.text,
	                                      'slug': slugify(category.text),
	                                      'domain': 'category'})

	        for tag in raw_post.findall('tag'):
	            if tag.text:
	                post['terms'].append({'name': tag.text,
	                                      'slug': slugify(tag.text),
	                                      'domain': 'post_tag'})

	        # Empty collections are left out of the export.
	        if not post['terms']:
	            del post['terms']

	        if not post['postmeta']:
	            del post['postmeta']

	        # findtext() tolerates a missing element; an empty body becomes ''.
	        post['post_content'] = process_body(raw_post.findtext('content') or '',
	                                            attachments_table,
	                                            raw_post.findtext('uselessMargin') == '1')

	        # Extend in place instead of rebuilding the whole list for every post.
	        posts.append(post)
	        posts.extend(attachments)

	        # Register tags not seen before; term_id is the position in ``tags``.
	        for raw_tag in raw_post.findall('tag'):
	            tag_name = raw_tag.text
	            if not tag_name or any(item['tag_name'] == tag_name for item in tags):
	                continue
	            tags.append({'term_id': len(tags),
	                         'tag_name': tag_name,
	                         'tag_slug': slugify(tag_name),
	                         'tag_description': ''})

	# Posts first, then notices.
	process_post('post')
	process_post('notice')

	# Assemble the export document.  'terms' is always written as an empty list.
	backup = {'authors':authors, 'posts':posts, 'categories':categories, 'tags':tags, 'terms':[], 'base_url':base_url, 'version':'1.2'}

	with open('export.json', 'w', encoding='utf8') as outfile:
	    # the dump must sit inside the with-block so the file is still open
	    json.dump(backup, outfile, ensure_ascii = False, indent = 2)