Skip to content

Instantly share code, notes, and snippets.

@m1n0
Forked from marktheunissen/htmlfiles_migration.inc.php
Last active August 29, 2015 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save m1n0/2e0d4d2b0c22ff98ce79 to your computer and use it in GitHub Desktop.
Save m1n0/2e0d4d2b0c22ff98ce79 to your computer and use it in GitHub Desktop.
<?php
/**
 * Migrates HTML files found under a configurable directory into 'mynode' nodes.
 *
 * Each source row is one HTML file; the file contents are cleaned up by
 * SourceParser and stored as the node body.
 */
class MyMigration extends Migration {

  /**
   * Base directory that is scanned for source HTML files.
   *
   * Populated in the constructor from the 'my_migration_source' variable.
   *
   * @var string
   */
  public $base_dir;

  /**
   * Constructor.
   *
   * Wires up the id map, the file-based source, the node destination and the
   * field mappings.
   */
  public function __construct() {
    parent::__construct();

    // A map of source HTML filename -> destination node id.
    $this->map = new MigrateSQLMap($this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => TRUE,
        )
      ),
      MigrateDestinationNode::getKeySchema()
    );

    // The source fields. All three are filled in by prepareRow().
    $fields = array(
      'title' => t('Title'),
      'body' => t('Body'),
      'uid' => t('User id'),
    );

    // Since the base directory of the HTML files can change depending on the
    // environment, we keep it in a variable. There is no interface for this,
    // set it using drush vset. When the variable is unset this falls back to
    // an empty string.
    $this->base_dir = variable_get('my_migration_source', '');

    // Match HTML files only. The pattern is anchored at the end so that names
    // which merely contain ".html" (e.g. "page.html.bak") are not picked up.
    $regex = '/.*\.html$/';

    // The source of the migration is HTML files from the old site.
    $list_files = new MigrateListFiles(array($this->base_dir), $this->base_dir, $regex);
    $item_file = new MigrateItemFile($this->base_dir);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);

    // The destination is the mynode content type.
    $this->destination = new MigrateDestinationNode('mynode');

    // Map the fields, pretty straightforward in this case.
    $this->addFieldMapping('uid', 'uid');
    $this->addFieldMapping('title', 'title');
    $this->addFieldMapping('body', 'body')
      ->arguments(array('format' => 'full_html'));
  }

  /**
   * Prepare a row.
   *
   * Populates the uid, body and title source fields for a single file.
   *
   * @param object $row
   *   The source row; this method reads its 'sourceid' and 'filedata'
   *   properties.
   *
   * @return bool
   *   FALSE if the parent implementation asked for the row to be skipped,
   *   TRUE otherwise.
   */
  public function prepareRow($row) {
    // Respect a skip decision made by the parent implementation.
    if (parent::prepareRow($row) === FALSE) {
      return FALSE;
    }

    // Set to admin for now.
    $row->uid = 1;

    // Create a new SourceParser to handle HTML content. The first character
    // of the source id is dropped before it is handed to the parser
    // (NOTE(review): presumably a leading "/" left over after the base
    // directory is stripped -- confirm).
    $source_parser = new SourceParser(substr($row->sourceid, 1), $row->filedata);
    $row->body = $source_parser->getBody();

    // The title is the source id, i.e. the filename as listed by the source.
    $row->title = $row->sourceid;

    return TRUE;
  }

}
<?php
// Include QueryPath.
require_once drupal_get_path('module', 'my_migration') . '/libraries/QueryPath-2.1.2-minimal/QueryPath.php';
require_once drupal_get_path('module', 'my_migration') . '/libraries/QueryPath-2.1.2-minimal/Extension/QPXML.php';
/**
 * Cleans up a chunk of legacy HTML and exposes the resulting body markup.
 *
 * All clean-up work happens in the constructor; getBody() then returns the
 * processed markup.
 */
class SourceParser {

  /**
   * Identifier of the parsed document (the filename).
   */
  protected $id;

  /**
   * The HTML being processed, as a string.
   */
  protected $html;

  /**
   * The QueryPath object built from the processed HTML.
   */
  protected $qp;

  /**
   * Builds the parser and immediately runs the clean-up pipeline.
   *
   * @param $id
   *   The filename, e.g. pm7205.html
   * @param $html
   *   The full HTML data as loaded from the file.
   */
  public function __construct($id, $html) {
    $this->id = $id;
    $this->html = $html;

    // Order matters: the string-level fixes have to run before the markup is
    // handed to QueryPath, and comments can only be removed once it exists.
    $this->charTransform();
    $this->fixEncoding();
    $this->wrapHTML();
    $this->initQP();
    $this->stripComments();
  }

  /**
   * Drops carriage returns from the HTML.
   */
  protected function charTransform() {
    // Windows line endings would otherwise show up as &#13; in the output,
    // see http://technosophos.com/content/querypath-whats-13-end-every-line
    $this->html = str_replace("\r", '', $this->html);
  }

  /**
   * Converts the HTML to UTF-8 when it is not already valid UTF-8.
   */
  protected function fixEncoding() {
    // Valid UTF-8 is left untouched.
    if (mb_detect_encoding($this->html, 'UTF-8', TRUE)) {
      return;
    }

    // Anything else is assumed to be WINDOWS-1252. Technically it could be
    // ISO-8859-1, but converting this way is safe and gets rid of bogus
    // characters. See http://en.wikipedia.org/wiki/Windows-1252
    $this->html = mb_convert_encoding($this->html, 'UTF-8', 'WINDOWS-1252');
  }

  /**
   * Surrounds the HTML fragment with a doctype, <html>, <head> and <body>.
   *
   * The head carries a UTF-8 content-type meta tag so that parsers and
   * tidiers detect the encoding correctly.
   */
  protected function wrapHTML() {
    $prefix = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
      . '<html xmlns="http://www.w3.org/1999/xhtml"><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /></head><body>';
    $suffix = '</body></html>';

    $this->html = $prefix . $this->html . $suffix;
  }

  /**
   * Instantiates QueryPath on the wrapped HTML.
   */
  protected function initQP() {
    $this->qp = htmlqp($this->html, NULL, array(
      'convert_to_encoding' => 'utf-8',
      'convert_from_encoding' => 'utf-8',
      'strip_low_ascii' => FALSE,
    ));
  }

  /**
   * Deletes every comment node from the document.
   */
  protected function stripComments() {
    $comments = $this->qp->top()->xpath('//comment()')->get();
    foreach ($comments as $node) {
      $node->parentNode->removeChild($node);
    }
  }

  /**
   * Returns the inner HTML of the <body> element.
   *
   * @return
   *   The body markup with surrounding whitespace trimmed.
   */
  public function getBody() {
    return trim($this->qp->top('body')->innerHTML());
  }

}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment