Last active
October 9, 2015 12:20
-
-
Save dmitryd/5fd4100e5542ada38e1e to your computer and use it in GitHub Desktop.
Indexing file content in TYPO3 6.2 with Solr
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
namespace My\solrfileindexing\Xclass; | |
use TYPO3\CMS\Core\Utility\GeneralUtility; | |
/**
 * Extends the solrfal document factory so that the textual content of a
 * file is written to the "content" field of its Solr document.
 *
 * Plain text files are read directly; every other file is handed to the
 * registered "textExtract" services.
 *
 * Requires cc_text* extensions!
 */
class DocumentFactory extends \TYPO3\Solr\Solrfal\Indexing\DocumentFactory {

    /**
     * Sets the "content" field of the document to the text extracted from
     * the file, then lets the parent implementation add its own fields.
     *
     * @param \Apache_Solr_Document $document Document that receives the "content" field
     * @param \TYPO3\CMS\Core\Resource\File $file File whose content is extracted
     * @return void
     */
    protected function addFileInformation(\Apache_Solr_Document $document, \TYPO3\CMS\Core\Resource\File $file) {
        $document->setField('content', $this->extractContent($file));
        parent::addFileInformation($document, $file);
    }

    /**
     * Extracts the text of a file and passes it through
     * Tx_Solr_HtmlContentExtractor to obtain the indexable content.
     *
     * Files with MIME type "text/plain" are read directly. For all other
     * files the "textExtract" services registered for the file extension are
     * tried one after another until one of them yields non-empty output. If
     * no service produces output, the extractor receives an empty string.
     *
     * @param \TYPO3\CMS\Core\Resource\File $file File to extract the text from
     * @return string Result of Tx_Solr_HtmlContentExtractor::getIndexableContent()
     */
    protected function extractContent(\TYPO3\CMS\Core\Resource\File $file) {
        $fileContent = '';

        if ($file->getMimeType() === 'text/plain') {
            // we can read text files directly
            $fileContent = $file->getContents();
        } else {
            // other subtypes should be handled by the text service
            $fileExtension = $file->getExtension();
            $serviceConfiguration = array('wantedCharset' => 'utf-8');
            // Comma list of service keys that were already tried; passed back
            // to makeInstanceService() so that it offers the next candidate.
            $serviceChain = '';
            // Resolved lazily and only once, so that no local copy is
            // requested when there is no usable service at all.
            $filePath = NULL;

            while ($fileContent === '') {
                $service = GeneralUtility::makeInstanceService('textExtract', $fileExtension, $serviceChain);
                // makeInstanceService() returns FALSE when no service is left
                // and an array with error information when a service failed
                // to initialise; neither can be used as a service object.
                if (!is_object($service)) {
                    break;
                }
                $serviceChain .= ',' . $service->getServiceKey();

                if ($filePath === NULL) {
                    $filePath = $file->getForLocalProcessing(FALSE);
                }
                $service->setInputFile($filePath, $fileExtension);
                $service->process('', '', $serviceConfiguration);
                $fileContent = trim($service->getOutput());
            }
        }

        /** @var \Tx_Solr_HtmlContentExtractor $contentExtractor */
        $contentExtractor = GeneralUtility::makeInstance('Tx_Solr_HtmlContentExtractor', $fileContent);

        return $contentExtractor->getIndexableContent();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Register My\solrfileindexing\Xclass\DocumentFactory as the implementation
// class for solrfal's DocumentFactory in TYPO3's object (XCLASS) registry.
$GLOBALS['TYPO3_CONF_VARS']['SYS']['Objects']['TYPO3\\Solr\\Solrfal\\Indexing\\DocumentFactory'] = array('className' => 'My\\solrfileindexing\\Xclass\\DocumentFactory');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment