Skip to content

Instantly share code, notes, and snippets.

Last active December 20, 2015 04:19
Show Gist options
  • Save fsuter/6070209 to your computer and use it in GitHub Desktop.
Save fsuter/6070209 to your computer and use it in GitHub Desktop.
Example files for importing WordPress entries into TYPO3 (in a custom table) using extension external_import.
/***************************************************************
* Copyright notice
*
* (c) 2010 Francois Suter (Cobweb) <>
* All rights reserved
*
* This script is part of the TYPO3 project. The TYPO3 project is
* free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* The GNU General Public License can be found at
* http://www.gnu.org/copyleft/gpl.html.
*
* This script is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* This copyright notice MUST APPEAR in all copies of the script!
***************************************************************/
/**
 * This class hooks into the external import process to modify some data
 *
 * @author Francois Suter (Cobweb) <>
 * @package TYPO3
 * @subpackage tx_journal
 */
class tx_journal_externalimport_hooks {
	/**
	 * Lookup table translating a WordPress category id (key) into the DAM category id (value)
	 * that replaces it during the category import.
	 *
	 * @var array
	 */
	static protected $wpCategoriesToDamCategories = array(
		1 => 19,
		2 => 6,
		3 => 7,
		4 => 5,
		5 => 11,
		6 => 14,
		7 => 18,
		8 => 16,
		9 => 12,
		10 => 29,
		11 => 19,
		12 => 21,
	);

	/**
	 * Extra DAM category ids appended (comma-separated) after the primary one,
	 * keyed by WordPress category id.
	 *
	 * @var array
	 */
	static protected $additionalDamCategories = array(
		3 => array(8),
		4 => array(15),
		8 => array(5)
	);

	/**
	 * This method processes an array of data coming from external_import, just after it was read from the external source
	 *
	 * The processing applied depends on the table and configuration index reported by the importer.
	 * Records belonging to any other table/index combination are returned untouched.
	 *
	 * @param array $records The data to process
	 * @param tx_externalimport_importer $importerObject Back-reference to the external_import object
	 * @return array The modified data array
	 */
	public function preprocessRawRecordset($records, tx_externalimport_importer $importerObject) {
		$table = $importerObject->getTableName();
		$index = $importerObject->getIndex();

		if ($table == 'tx_dam' && $index == 0) {
			return $this->reduceFileNamesToBaseNames($records);
		}
		if ($table == 'tx_journal_entries') {
			if ($index == 1) {
				return $this->assignThumbnailsAndGalleryImages($records);
			}
			if ($index == 2) {
				return $this->rewriteLinksAndParagraphs($records);
			}
			if ($index == 3) {
				return $this->mapCategories($records);
			}
		}
		return $records;
	}

	/**
	 * Returns the file name found at the end of the path component of a URL.
	 *
	 * @param string $url The URL (or path) to inspect
	 * @return string The base name of the path, or an empty string if the URL has no usable path
	 */
	protected function getFileNameFromUrl($url) {
		$urlParts = parse_url($url);
			// parse_url() returns FALSE on seriously malformed URLs and omits absent components
		if (!is_array($urlParts) || !isset($urlParts['path'])) {
			return '';
		}
		return basename($urlParts['path']);
	}

	/**
	 * For imports referring to file paths, keeps only the file name in the "file_name" field.
	 *
	 * @param array $records The data to process
	 * @return array The modified data array
	 */
	protected function reduceFileNamesToBaseNames($records) {
		$numRecords = count($records);
		for ($i = 0; $i < $numRecords; $i++) {
			$records[$i]['file_name'] = $this->getFileNameFromUrl($records[$i]['file_name']);
		}
		return $records;
	}

	/**
	 * Decides, for each record, whether its image is the post's thumbnail or a gallery image.
	 *
	 * Several records may share the same "wp_id" (one per image); a counter per post
	 * tells the first image apart from the following ones.
	 *
	 * @param array $records The data to process
	 * @return array The modified data array
	 */
	protected function assignThumbnailsAndGalleryImages($records) {
		$numRecords = count($records);
		$counterPerPost = array();
		for ($i = 0; $i < $numRecords; $i++) {
			$externalId = $records[$i]['wp_id'];
				// Count images per post, starting at 0 for the first one.
				// Without the increment every image would be considered to be the first.
			if (isset($counterPerPost[$externalId])) {
				$counterPerPost[$externalId]++;
			} else {
				$counterPerPost[$externalId] = 0;
			}
			$fileName = $this->getFileNameFromUrl($records[$i]['gallery']);
				// If this is the first image and the post starts with an img tag, this is the thumbnail
			if ($counterPerPost[$externalId] == 0 && strpos($records[$i]['bodytext'], '<img') === 0) {
				$records[$i]['thumbnail'] = $fileName;
					// NOTE(review): "gallery" keeps its original value here (and "thumbnail" below).
					// Confirm whether the unused field should be cleared.
				// Otherwise it's a gallery image
			} else {
				$records[$i]['gallery'] = $fileName;
			}
		}
		return $records;
	}

	/**
	 * Cleans up the body text of each post: removes the leading image tag and links to uploaded
	 * images, points links to other posts at the imported journal entries, restores paragraph tags,
	 * and registers the linked posts as related entries.
	 *
	 * A record linking to several imported posts is duplicated at the end of the record set,
	 * once per additional related post.
	 *
	 * @param array $records The data to process
	 * @return array The modified data array
	 */
	protected function rewriteLinksAndParagraphs($records) {
			// Get the existing entries, indexed by WordPress id
		$journalEntries = $GLOBALS['TYPO3_DB']->exec_SELECTgetRows('uid, wp_id', 'tx_journal_entries', 'wp_id > 0 AND pid = 69', '', '', '', 'wp_id');
		if (!is_array($journalEntries)) {
			$journalEntries = array();
		}
		$numRecords = count($records);
			// This array will contain related entries beyond the first ones
			// (i.e. copies of the original record, with relation to the additional entries)
		$relatedEntries = array();
		for ($i = 0; $i < $numRecords; $i++) {
				// Find if there's an opening image tag and remove it
			$records[$i]['bodytext'] = preg_replace('/^<img\s(.+?)\s\/>/', '', $records[$i]['bodytext']);
			$relatedEntries[$i] = array();
				// Remove the "default" related entry
			unset($records[$i]['related']);
			$matches = array();
			$result = preg_match_all('/<a\s(.*?)href="([^"]+?)"[^>]*?>(.+?)<\/a>/', $records[$i]['bodytext'], $matches);
			if ($result) {
				$searchMap = array();
				$replaceMap = array();
				for ($j = 0; $j < $result; $j++) {
					$url = $matches[2][$j];
					$urlParts = parse_url($url);
					if (!is_array($urlParts)) {
						$urlParts = array();
					}
					$host = isset($urlParts['host']) ? $urlParts['host'] : '';
					$path = isset($urlParts['path']) ? $urlParts['path'] : '';
						// Check if it's a local URL
					if ($host == '') {
							// It's an image, we want to remove the whole link and its content
						if (strpos($path, 'wp-content') !== FALSE) {
							$searchMap[] = $matches[0][$j];
							$replaceMap[] = '';
							// It's a journal entry
						} elseif (strpos($path, 'francois/blog') !== FALSE) {
								// Extract the Wordpress id and try to match it to an imported entry
							$queryParameters = array();
							if (isset($urlParts['query'])) {
								$queryParameters = t3lib_div::explodeUrl2Array($urlParts['query']);
							}
							if (isset($queryParameters['p']) && isset($journalEntries[$queryParameters['p']])) {
								$entryUid = $journalEntries[$queryParameters['p']]['uid'];
								$searchMap[] = 'href="' . $url . '"';
								$replaceMap[] = 'href="record:tx_journal_entries:' . $entryUid . '" external="1"';
									// Add a "related" entry (if it's the first)
								if (count($relatedEntries[$i]) == 0) {
									$records[$i]['related'] = $queryParameters['p'];
								}
									// Keep record of all related entries
								$relatedEntries[$i][] = $queryParameters['p'];
							}
							// It's some other link, do nothing but record it in the devLog
						} else {
							t3lib_div::devLog('Not a content link: ' . $url, 'external_import', 1);
						}
						// It's an external link, do nothing but record it in the devLog
					} else {
						t3lib_div::devLog('Not a local link: ' . $url, 'external_import', 1);
					}
				}
					// Make sure to replace at least all "target blank" attributes with nothing.
					// str_replace() applies the searches one after the other, so this generic
					// replacement must come after the link-specific ones: the captured links above
					// may themselves contain the attribute and would no longer match otherwise.
				$searchMap[] = ' target="_blank"';
				$replaceMap[] = '';
					// Removing an image link may leave an empty centered paragraph behind
				$searchMap[] = '<p align="center"></p>';
				$replaceMap[] = '';
				$records[$i]['bodytext'] = str_replace($searchMap, $replaceMap, $records[$i]['bodytext']);
			}
				// WordPress does not store <p> tags unless they have attributes. It used line breaks instead.
				// So explode bodytext on line breaks and reinstate <p> tags wherever necessary
				// Take this opportunity to remove empty paragraphs
			$paragraphs = t3lib_div::trimExplode("\n", $records[$i]['bodytext'], TRUE);
			$records[$i]['bodytext'] = '';
			foreach ($paragraphs as $aParagraph) {
				$aParagraph = trim($aParagraph);
				if ($aParagraph !== '') {
						// Also recognise paragraph tags carrying attributes, to avoid wrapping them twice
					if (preg_match('/<p[\s>]/', $aParagraph)) {
						$records[$i]['bodytext'] .= $aParagraph;
					} else {
						$records[$i]['bodytext'] .= '<p>' . $aParagraph . '</p>';
					}
				}
			}
		}
			// Loop again on all records, to duplicate those that have more than 1 related entry
		$newRecordIndex = $numRecords;
		for ($i = 0; $i < $numRecords; $i++) {
			if (count($relatedEntries[$i]) > 1) {
					// Drop the first element (it is already set on the original record)
				array_shift($relatedEntries[$i]);
					// Loop on the others and create copies of the original record with the new relation
				foreach ($relatedEntries[$i] as $relatedWpId) {
					$records[$newRecordIndex] = $records[$i];
					$records[$newRecordIndex]['related'] = $relatedWpId;
						// Move on, so that the next copy does not overwrite this one
					$newRecordIndex++;
				}
			}
		}
		return $records;
	}

	/**
	 * Replaces the WordPress category id of each record with the matching DAM category id(s).
	 *
	 * @param array $records The data to process
	 * @return array The modified data array
	 */
	protected function mapCategories($records) {
		$numRecords = count($records);
		for ($i = 0; $i < $numRecords; $i++) {
			$wpCategoryId = $records[$i]['categories'];
				// An id missing from the lookup table yields NULL, without raising a notice
			$records[$i]['categories'] = isset(self::$wpCategoriesToDamCategories[$wpCategoryId]) ? self::$wpCategoriesToDamCategories[$wpCategoryId] : NULL;
			if (isset(self::$additionalDamCategories[$wpCategoryId])) {
				$records[$i]['categories'] .= ',' . implode(',', self::$additionalDamCategories[$wpCategoryId]);
			}
		}
		return $records;
	}
}
// External import configuration for WordPress stuff
// Settings shared by every import below: each one copies this array
// and then adds its own query, description, priority, etc.
$commonImportConfiguration = array(
	'connector' => 'sql',
	'data' => 'array',
		// Connection settings; "server" is empty and the credentials look like placeholders
	'parameters' => array(
		'driver' => 'mysql',
		'server' => '',
		'user' => 'foo',
		'password' => 'bar',
		'database' => 'wordpress',
		'init' => 'SET NAMES utf8'
	),
	'minimumRecords' => 1,
	'enforcePid' => 1
);
// Import image titles into the DAM (configuration index 0 of table tx_dam)
$index = 0;
$table = 'tx_dam';
$TCA[$table]['ctrl']['external'][$index] = $commonImportConfiguration;
$TCA[$table]['ctrl']['external'][$index]['parameters']['query'] = "SELECT * FROM wp_posts WHERE post_status = 'attachment' AND post_mime_type LIKE 'image%'";
$TCA[$table]['ctrl']['external'][$index]['description'] = 'Import image titles into the DAM (WP)';
$TCA[$table]['ctrl']['external'][$index]['priority'] = 10;
$TCA[$table]['ctrl']['external'][$index]['pid'] = tx_dam_db::getPid();
	// Inserting and deleting are switched off for this import
$TCA[$table]['ctrl']['external'][$index]['disabledOperations'] = 'insert,delete';
$TCA[$table]['ctrl']['external'][$index]['reference_uid'] = 'file_name';

	// "guid" holds a full URL; the preprocessRawRecordset hook reduces it to the file name
$TCA[$table]['columns']['file_name']['external'][$index]['field'] = 'guid';
$TCA[$table]['columns']['title']['external'][$index]['field'] = 'post_title';
$TCA[$table]['columns']['date_cr']['external'][$index] = array(
	'field' => 'post_date',
	'userFunc' => array(
		'class' => 'EXT:external_import/samples/class.tx_externalimport_transformations.php:&tx_externalimport_transformations',
		'method' => 'parseDate',
		'params' => array(
			'enforceTimeZone' => TRUE
		)
	)
);
	// The modification date uses exactly the same configuration as the creation date
$TCA[$table]['columns']['date_mod']['external'][$index] = $TCA[$table]['columns']['date_cr']['external'][$index];
// Import posts (configuration index 1 of table tx_journal_entries)
$index = 1;
$table = 'tx_journal_entries';
$TCA[$table]['ctrl']['external'][$index] = $commonImportConfiguration;
	// The self-join returns one row per image attached to a post (or a single row for posts without images)
$TCA[$table]['ctrl']['external'][$index]['parameters']['query'] = "
	SELECT wp_posts.ID, wp_posts.post_title, wp_posts.post_date, wp_posts.post_content, images.guid
	FROM wp_posts
	LEFT JOIN wp_posts AS images ON images.post_parent = wp_posts.ID
	WHERE wp_posts.post_status = 'publish' AND wp_posts.post_mime_type = '' ORDER BY post_date
";
$TCA[$table]['ctrl']['external'][$index]['description'] = 'Import posts (WP)';
$TCA[$table]['ctrl']['external'][$index]['priority'] = 20;
$TCA[$table]['ctrl']['external'][$index]['pid'] = 69;
$TCA[$table]['ctrl']['external'][$index]['reference_uid'] = 'wp_id';

$TCA[$table]['columns']['wp_id']['external'][$index]['field'] = 'ID';
$TCA[$table]['columns']['title']['external'][$index]['field'] = 'post_title';
$TCA[$table]['columns']['official_date']['external'][$index] = array(
	'field' => 'post_date',
	'userFunc' => array(
		'class' => 'EXT:external_import/samples/class.tx_externalimport_transformations.php:&tx_externalimport_transformations',
		'method' => 'parseDate',
		'params' => array(
			'enforceTimeZone' => TRUE
		)
	)
);
$TCA[$table]['columns']['bodytext']['external'][$index] = array(
	'field' => 'post_content',
	'rteEnabled' => TRUE
);
	// Both image fields read the same "guid" column; the preprocessRawRecordset hook
	// decides per record which of the two receives the file name
$TCA[$table]['columns']['thumbnail']['external'][$index] = array(
	'field' => 'guid',
	'MM' => array(
		'mapping' => array(
			'table' => 'tx_dam',
			'reference_field' => 'file_name'
		)
	)
);
$TCA[$table]['columns']['gallery']['external'][$index] = array(
	'field' => 'guid',
	'MM' => array(
		'mapping' => array(
			'table' => 'tx_dam',
			'reference_field' => 'file_name'
		)
	)
);
// Import posts again for link checking (configuration index 2, same table as the previous import)
$index = 2;
$TCA[$table]['ctrl']['external'][$index] = $commonImportConfiguration;
$TCA[$table]['ctrl']['external'][$index]['parameters']['query'] = "
	SELECT wp_posts.ID, wp_posts.post_content FROM wp_posts
	WHERE wp_posts.post_status = 'publish' AND wp_posts.post_mime_type = '' ORDER BY post_date
";
$TCA[$table]['ctrl']['external'][$index]['description'] = 'Import posts again for link checking (WP)';
$TCA[$table]['ctrl']['external'][$index]['priority'] = 25;
$TCA[$table]['ctrl']['external'][$index]['pid'] = 69;
	// Inserting and deleting are switched off for this import
$TCA[$table]['ctrl']['external'][$index]['disabledOperations'] = 'insert,delete';
$TCA[$table]['ctrl']['external'][$index]['reference_uid'] = 'wp_id';

$TCA[$table]['columns']['wp_id']['external'][$index]['field'] = 'ID';
$TCA[$table]['columns']['bodytext']['external'][$index] = array(
	'field' => 'post_content',
	'rteEnabled' => TRUE
);
	// Reads the post's own ID by default; the preprocessRawRecordset hook overrides
	// this value with the WordPress ids of the posts linked from the body text
$TCA[$table]['columns']['related']['external'][$index] = array(
	'field' => 'ID',
	'MM' => array(
		'mapping' => array(
			'table' => 'tx_journal_entries',
			'reference_field' => 'wp_id'
		)
	)
);
// Import categories (configuration index 3, same table as the previous imports)
$index = 3;
$TCA[$table]['ctrl']['external'][$index] = $commonImportConfiguration;
$TCA[$table]['ctrl']['external'][$index]['parameters']['query'] = "
	SELECT wp_posts.ID, wp_post2cat.category_id FROM wp_posts
	INNER JOIN wp_post2cat ON post_id = ID
	WHERE wp_posts.post_status = 'publish' AND wp_posts.post_mime_type = ''
";
$TCA[$table]['ctrl']['external'][$index]['description'] = 'Import posts again for rebuilding categories (WP)';
$TCA[$table]['ctrl']['external'][$index]['priority'] = 30;
$TCA[$table]['ctrl']['external'][$index]['pid'] = 69;
	// Inserting and deleting are switched off for this import
$TCA[$table]['ctrl']['external'][$index]['disabledOperations'] = 'insert,delete';
$TCA[$table]['ctrl']['external'][$index]['reference_uid'] = 'wp_id';

$TCA[$table]['columns']['wp_id']['external'][$index]['field'] = 'ID';
	// The WordPress category id is translated by the preprocessRawRecordset hook
$TCA[$table]['columns']['categories']['external'][$index]['field'] = 'category_id';
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment