Skip to content

Instantly share code, notes, and snippets.

@ddebin
Last active December 27, 2015 09:28
Show Gist options
  • Save ddebin/7303653 to your computer and use it in GitHub Desktop.
Save ddebin/7303653 to your computer and use it in GitHub Desktop.
Ce script convertit un backup/export DotClear ("blog-backup.txt" flatExport) en posts markdown pour Octopress/Jekyll (ou autres) et en un fichier de commentaires WXR (WordPress eXtended RSS) pour import XML sur Disqus.
<?php
// --
// - Ce script convertit un backup/export DotClear en posts markdown pour Octopress/Jekyll (ou autres)
// et en un fichier de commentaires WXR (WordPress eXtended RSS) pour import XML sur Disqus.
// - Il faut installer le plugin "flatExport" dans DotClear pour récupérer le "blog-backup.txt"
// cf. http://plugins.dotaddict.org/dc1/details/flatExport
// - Vous avez besoin de Pandoc d'installé sur la machine pour la conversion vers Markdown
// cf. http://johnmacfarlane.net/pandoc/
// --
// Base URL of the blog; the post URLs sent to Disqus are built from it.
define('POST_WEB_PREFIX', 'http://damiendebin.net/blog/');

// Location of the "flatExport" backup to convert.
define('BLOG_BACKUP_PATH', __DIR__.'/blog-backup.txt');

// Default values written to the YAML front matter of every post.
define('POST_LAYOUT', 'post');
define('POST_COMMENTS', 'true');
define('POST_PUBLISHED', 'false');
define('POST_CATEGORIES', '[paris.photobloggers.org]');
define('POST_TAGS', '[]');

// Time zone of the timestamps stored in the "flatExport" backup.
date_default_timezone_set('Europe/Paris');

// Posts written by these user ids get no "author: ..." front-matter line.
// Use an empty array() to emit the author for everyone.
$SKIP_AUTHORS = array('ddebin');
// -------------
/**
 * array_walk() callback that normalises one field value in place.
 *
 * Literal two-character escape sequences (a backslash followed by n, r, t,
 * ", ~, > or <) become the character they stand for, and typographic
 * punctuation (ellipsis, en dash, curly quotes) becomes plain ASCII.
 *
 * @param string $v Field value, modified by reference.
 * @param mixed  $k Field key supplied by array_walk(); not used.
 */
function clean_value(&$v, $k)
{
    // sequence to look for => what it becomes; str_replace() applies the
    // pairs one after another, in this order
    $substitutions = array(
        '\n' => "\n",
        '\r' => "\r",
        '\t' => "\t",
        '\"' => '"',
        '…'  => '...',
        '–'  => '-',
        '’'  => "'",
        '“'  => '"',
        '”'  => '"',
        '\~' => '~',
        '\>' => '>',
        '\<' => '<',
    );

    $v = str_replace(array_keys($substitutions), array_values($substitutions), $v);
}
/**
 * Converts an HTML fragment to PHP Markdown Extra by shelling out to Pandoc.
 *
 * The HTML is written to a temporary file, which is passed to the `pandoc`
 * executable and deleted afterwards.
 *
 * @param string $html HTML fragment to convert.
 * @return string Pandoc's standard output, lines joined with "\n". When the
 *                conversion fails a warning is raised and whatever output was
 *                captured (normally an empty string) is returned.
 */
function convert_to_markdown($html)
{
    // tempnam() creates the file itself under a unique name, whereas a path
    // built from uniqid() is only probably unique.
    $tmpFile = tempnam(sys_get_temp_dir(), 'pandoc');
    if ($tmpFile === false) {
        trigger_error('convert_to_markdown: unable to create a temporary file', E_USER_WARNING);
        return '';
    }

    $output = array();
    if (file_put_contents($tmpFile, $html) === false) {
        trigger_error("convert_to_markdown: unable to write to $tmpFile", E_USER_WARNING);
    } else {
        // "--no-wrap" was removed in Pandoc 2.0; "--wrap=none" is its replacement.
        $command = 'pandoc --from=html --to=markdown_phpextra --wrap=none '.escapeshellarg($tmpFile);
        $status = 0;
        exec($command, $output, $status);
        if ($status !== 0) {
            // Without this check a missing or failing pandoc silently produced empty posts.
            trigger_error("convert_to_markdown: pandoc exited with status $status", E_USER_WARNING);
        }
    }

    unlink($tmpFile);
    return implode("\n", $output);
}
/**
 * Extracts the rows of one section of the flatExport backup.
 *
 * A section starts with a header line "[<section> <col1>,<col2>,...]" and is
 * followed by one CSV line per row; every row line starts with a double quote.
 *
 * @param array  $lines   Lines of the backup file (empty lines removed).
 * @param string $section Section name, e.g. "post".
 * @return array List of rows, each an associative array column name => value
 *               passed through clean_value(). Empty if the section is absent.
 */
function read_backup_section($lines, $section)
{
    $marker = '['.$section.' ';
    $total = count($lines);

    // Find the header line; the bound is tested before the line is read.
    $i = 0;
    while ($i < $total && strpos($lines[$i], $marker) !== 0) $i++;
    if ($i >= $total) return array();

    // Column names sit between the marker and the closing "]".
    $keys = str_getcsv(substr($lines[$i], strlen($marker), -1), ',', '"', '\\');

    $rows = array();
    for ($i++; $i < $total && substr($lines[$i], 0, 1) == '"'; $i++)
    {
        $values = str_getcsv($lines[$i], ',', '"', '\\');
        if (count($values) !== count($keys))
        {
            // array_combine() cannot pair such a row with the column names.
            trigger_error("Skipping malformed [$section] row at backup line index $i", E_USER_WARNING);
            continue;
        }
        $row = array_combine($keys, $values);
        array_walk($row, 'clean_value');
        $rows[] = $row;
    }
    return $rows;
}

$r = file(BLOG_BACKUP_PATH, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($r === false) throw new RuntimeException('Unable to read backup file '.BLOG_BACKUP_PATH);

// settings, indexed by setting_id
$settings = array();
foreach (read_backup_section($r, 'setting') as $l)
{
    $settings[$l['setting_id']] = $l;
}

// posts, indexed by post_id; each gets a 'comments' list filled in below
$posts = array();
foreach (read_backup_section($r, 'post') as $l)
{
    $l['comments'] = array();
    $posts[$l['post_id']] = $l;
}

// comments, kept both as a flat list and attached to their post
$comments = array();
foreach (read_backup_section($r, 'comment') as $l)
{
    $comments[] = $l;
    if (isset($posts[$l['post_id']]))
    {
        $posts[$l['post_id']]['comments'][] = $l;
    }
    else
    {
        // Appending blindly would create a post entry holding nothing but
        // 'comments', which the export loop would then treat as a real post.
        trigger_error("Comment {$l['comment_id']} refers to unknown post {$l['post_id']}", E_USER_WARNING);
    }
}

// users, indexed by user_id
$users = array();
foreach (read_backup_section($r, 'user') as $l)
{
    $users[$l['user_id']] = $l;
}
// export posts w/ comments
// --

/**
 * Renders a string as a YAML double-quoted scalar.
 *
 * @param string $text Raw value.
 * @return string The value wrapped in double quotes, with backslashes and
 *                double quotes escaped.
 */
function yaml_double_quoted($text)
{
    // Backslashes first, so the ones added for the quotes are not doubled.
    return '"'.str_replace(array('\\', '"'), array('\\\\', '\\"'), $text).'"';
}

/**
 * Wraps a string in an XML CDATA section.
 *
 * @param string $text Raw text.
 * @return string CDATA section(s) holding $text; any "]]>" inside the text is
 *                split across two sections so it cannot end the section early.
 */
function cdata_section($text)
{
    return '<![CDATA['.str_replace(']]>', ']]]]><![CDATA[>', $text).']]>';
}

$post_dir = __DIR__.'/_posts';
if (!is_dir($post_dir) && !mkdir($post_dir))
{
    throw new RuntimeException("Unable to create output directory $post_dir");
}

$xml_comment = <<<EOT
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dsq="http://www.disqus.com/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:wp="http://wordpress.org/export/1.0/">
<channel>
EOT;

foreach ($posts as $post)
{
    // --- Markdown body: optional excerpt (post_chapo), then the content ---
    $post_md = '';
    if (!empty($post['post_chapo'])) $post_md .= convert_to_markdown($post['post_chapo'])."<!-- more -->\n";
    if (!empty($post['post_content'])) $post_md .= convert_to_markdown($post['post_content']);
    $post_md = html_entity_decode($post_md);
    // Collapse the padding after a dash ("-   item" => "- item"). Only spaces
    // and tabs are matched: \s would also swallow the blank line after a line
    // ending in "-" (setext underline, horizontal rule) and glue the next
    // paragraph onto it.
    $post_md = preg_replace('/-[ \t]{2,}/', '- ', $post_md);
    $post_md = trim($post_md);

    // --- YAML front matter ---
    $date = strtotime($post['post_creadt']);
    $post_layout = POST_LAYOUT;
    $post_categories = POST_CATEGORIES;
    $post_published = POST_PUBLISHED;
    $post_comments = POST_COMMENTS;
    $post_tags = POST_TAGS;
    $post_uri = POST_WEB_PREFIX.date("Y/m/d/", $date).$post['post_titre_url'].'/';
    $post_date = date('Y-m-d H:i:sP', $date);
    // Always quoted and escaped, so a title containing a double quote still
    // yields a valid YAML scalar.
    $post_title = yaml_double_quoted(html_entity_decode($post['post_titre']));

    if (is_array($SKIP_AUTHORS) && in_array($post['user_id'], $SKIP_AUTHORS)) $post_author = '';
    elseif (isset($users[$post['user_id']])) $post_author = "\nauthor: {$users[$post['user_id']]['user_prenom']} {$users[$post['user_id']]['user_nom']}";
    else $post_author = ''; // user missing from the backup: leave the author line out

    $post_content = <<<EOT
---
date: $post_date
layout: $post_layout
title: $post_title$post_author
comments: $post_comments
categories: $post_categories
tags: $post_tags
published: $post_published
---
EOT;
    $post_content .= "\n".$post_md;

    echo "$post_title\n";

    // write Markdown post file
    $path = $post_dir.'/'.date('Y-m-d', $date).'-'.$post['post_titre_url'].'.markdown';
    if (file_put_contents($path, $post_content) === false)
    {
        trigger_error("Unable to write $path", E_USER_WARNING);
    }

    // --- WXR <item> with the post's comments; posts without comments are left out ---
    if (!empty($post['comments']))
    {
        $xml_title = htmlspecialchars(html_entity_decode($post['post_titre']));
        $xml_uri = htmlspecialchars($post_uri);
        $post_date_gmt = htmlspecialchars(gmdate('Y-m-d H:i:s', $date));

        $xml_comment .= <<<EOT
<item>
<title>$xml_title</title>
<link>$xml_uri</link>
<dsq:thread_identifier>$xml_uri</dsq:thread_identifier>
<wp:post_date_gmt>$post_date_gmt</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
EOT;

        foreach ($post['comments'] as $comment)
        {
            $comment_date_gmt = gmdate('Y-m-d H:i:s', strtotime($comment['comment_dt']));
            $comment_id = htmlspecialchars($comment['comment_id']);
            $comment_auteur = htmlspecialchars($comment['comment_auteur']);
            $comment_email = htmlspecialchars($comment['comment_email']);
            $comment_site = htmlspecialchars($comment['comment_site']);
            $comment_ip = htmlspecialchars($comment['comment_ip']);
            $comment_pub = htmlspecialchars($comment['comment_pub']);
            // The comment body goes out verbatim inside CDATA, "]]>" made safe.
            $comment_content = cdata_section($comment['comment_content']);

            $xml_comment .= <<<EOT
<wp:comment>
<wp:comment_id>$comment_id</wp:comment_id>
<wp:comment_author>$comment_auteur</wp:comment_author>
<wp:comment_author_email>$comment_email</wp:comment_author_email>
<wp:comment_author_url>$comment_site</wp:comment_author_url>
<wp:comment_author_IP>$comment_ip</wp:comment_author_IP>
<wp:comment_date_gmt>$comment_date_gmt</wp:comment_date_gmt>
<wp:comment_content>$comment_content</wp:comment_content>
<wp:comment_approved>$comment_pub</wp:comment_approved>
<wp:comment_parent>0</wp:comment_parent>
</wp:comment>
EOT;
        }

        $xml_comment .= <<<EOT
</item>
EOT;
    }
    //break; //debugging purpose
}

$xml_comment .= <<<EOT
</channel>
</rss>
EOT;

// write WXR compatible XML file for Disqus
if (file_put_contents(__DIR__.'/comments.xml', $xml_comment) === false)
{
    trigger_error('Unable to write '.__DIR__.'/comments.xml', E_USER_WARNING);
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment