Created
May 4, 2012 18:32
-
-
Save marktheunissen/2596787 to your computer and use it in GitHub Desktop.
HTML Files Migration into Drupal using Migrate
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Migrates HTML files found under a configurable directory into 'mynode'
 * nodes.
 *
 * Each HTML file becomes one node: the file's source id is used as the title
 * and the parsed contents of the file as the body.
 */
class MyMigration extends Migration {

  /**
   * Base directory that is scanned for source HTML files.
   *
   * Populated in the constructor from the 'my_migration_source' variable.
   *
   * @var string
   */
  public $base_dir;

  /**
   * Constructor.
   *
   * Wires up the source-to-destination map, the file based source, the node
   * destination and the field mappings.
   */
  public function __construct() {
    parent::__construct();

    // A map of source HTML filename -> destination node id.
    $this->map = new MigrateSQLMap($this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => TRUE,
        )
      ),
      MigrateDestinationNode::getKeySchema()
    );

    // The source fields. All three are filled in by prepareRow().
    $fields = array(
      'title' => t('Title'),
      'body' => t('Body'),
      'uid' => t('User id'),
    );

    // Since the base directory of the HTML files can change depending on the
    // environment, we keep it in a variable. There is no interface for this,
    // set it using drush vset. Defaults to an empty string when unset.
    $this->base_dir = variable_get('my_migration_source', '');

    // Match HTML files. The pattern is anchored at the end so that names which
    // merely contain ".html" (e.g. "page.html.bak") are not picked up.
    $regex = '/.*\.html$/';

    // The source of the migration is HTML files from the old site.
    $list_files = new MigrateListFiles(array($this->base_dir), $this->base_dir, $regex);
    $item_file = new MigrateItemFile($this->base_dir);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);

    // The destination is the mynode content type.
    $this->destination = new MigrateDestinationNode('mynode');

    // Map the fields, pretty straightforward in this case.
    $this->addFieldMapping('uid', 'uid');
    $this->addFieldMapping('title', 'title');
    $this->addFieldMapping('body', 'body')
      ->arguments(array('format' => 'full_html'));
  }

  /**
   * Prepare a row.
   *
   * Fills in the uid, body and title source fields on the row before it is
   * mapped onto the destination node.
   *
   * NOTE(review): newer Migrate releases recommend starting with
   * "if (parent::prepareRow($row) === FALSE) return FALSE;" - confirm the
   * installed version provides Migration::prepareRow() before adding it.
   *
   * @param $row
   *   The source row; reads $row->sourceid and $row->filedata, and sets
   *   $row->uid, $row->body and $row->title.
   */
  public function prepareRow($row) {
    // Set to admin for now.
    $row->uid = 1;

    // Create a new SourceParser to handle HTML content. The first character of
    // the source id is dropped before it is handed to the parser.
    $source_parser = new SourceParser(substr($row->sourceid, 1), $row->filedata);
    $row->body = $source_parser->getBody();

    // The title is the filename.
    // NOTE(review): unlike the parser id above, the title keeps the first
    // character of sourceid - confirm that is intended.
    $row->title = $row->sourceid;
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Include QueryPath. | |
require_once drupal_get_path('module', 'my_migration') . '/libraries/QueryPath-2.1.2-minimal/QueryPath.php'; | |
require_once drupal_get_path('module', 'my_migration') . '/libraries/QueryPath-2.1.2-minimal/Extension/QPXML.php'; | |
/**
 * Normalises a raw HTML fragment and exposes its cleaned-up body.
 *
 * On construction the fragment is stripped of carriage returns, converted to
 * UTF-8 when necessary, wrapped in a full XHTML document, loaded into
 * QueryPath and cleared of comments. getBody() then returns the result.
 */
class SourceParser {

  /**
   * Identifier of the source document, e.g. the filename pm7205.html.
   */
  protected $id;

  /**
   * The HTML string being processed; rewritten by each preparation step.
   */
  protected $html;

  /**
   * The QueryPath object built from the prepared HTML.
   */
  protected $qp;

  /**
   * Constructor.
   *
   * @param $id
   *   The filename, e.g. pm7205.html
   * @param $html
   *   The full HTML data as loaded from the file.
   */
  public function __construct($id, $html) {
    $this->id = $id;
    $this->html = $html;

    // The preparation steps run strictly in this order: the string level
    // clean-up has to be finished before QueryPath parses the document, and
    // comments can only be stripped once the QueryPath object exists.
    $pipeline = array(
      'charTransform',
      'fixEncoding',
      'wrapHTML',
      'initQP',
      'stripComments',
    );
    foreach ($pipeline as $step) {
      $this->{$step}();
    }
  }

  /**
   * Replace characters.
   */
  protected function charTransform() {
    // Drop every carriage return (ASCII 13) so that Windows line endings do
    // not show up as stray &#13; entities in the output.
    // http://technosophos.com/content/querypath-whats-13-end-every-line
    $this->html = str_replace("\r", '', $this->html);
  }

  /**
   * Deal with encodings.
   */
  protected function fixEncoding() {
    // Content that already validates as UTF-8 is left untouched.
    if (mb_detect_encoding($this->html, 'UTF-8', TRUE) !== FALSE) {
      return;
    }

    // Anything else is assumed to be WINDOWS-1252, which fixes bogus
    // character issues. Technically it could be ISO-8859-1 but it's safe to
    // convert this way.
    // http://en.wikipedia.org/wiki/Windows-1252
    $this->html = mb_convert_encoding($this->html, 'UTF-8', 'WINDOWS-1252');
  }

  /**
   * Wrap the HTML fragment in a doctype, <html>, <head> and <body> so that
   * UTF-8 is correctly detected by the parsers and tidiers.
   */
  protected function wrapHTML() {
    $prefix = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
      . '<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>';
    $suffix = '</body></html>';

    $this->html = $prefix . $this->html . $suffix;
  }

  /**
   * Create the QueryPath object.
   */
  protected function initQP() {
    // Input and output are both declared as UTF-8, and low ASCII stripping is
    // switched off.
    $this->qp = htmlqp($this->html, NULL, array(
      'convert_to_encoding' => 'utf-8',
      'convert_from_encoding' => 'utf-8',
      'strip_low_ascii' => FALSE,
    ));
  }

  /**
   * Remove the comments from the HTML.
   */
  protected function stripComments() {
    // Collect every comment node in the document first, then detach each one
    // from its parent.
    $comments = $this->qp->top()->xpath('//comment()')->get();
    foreach ($comments as $node) {
      $node->parentNode->removeChild($node);
    }
  }

  /**
   * Return the HTML.
   *
   * @return
   *   The inner HTML of the <body> element with surrounding whitespace
   *   trimmed.
   */
  public function getBody() {
    return trim($this->qp->top('body')->innerHTML());
  }

}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment