Last active
June 26, 2024 02:03
-
-
Save msaari/a0f3c20c5f7e60b750233802264688e4 to your computer and use it in GitHub Desktop.
Relevanssi attachment indexing server intermediary script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Attachment processing intermediary to work between Relevanssi and a Tika server. | |
* | |
* Installation instructions: | |
* 1. Save this as index.php. | |
* 2. Change the Tika server URL in the constructor to point to your own Tika server. | |
* 3. Upload this file in a directory on your server. | |
* | |
* @author Mikko Saari (mikko@mikkosaari.fi) | |
* @license GNU General Public License 3.0 or later | |
* @see https://www.relevanssi.com/user-manual/attachment-server/ | |
*/ | |
// Entry point. A request carrying an `upload` query argument is handled from
// the query-string parameters (the document itself is read from the request
// body by the controller); any other request is handled from the POST fields.
$PdfProcessor = new PdfController();
$PdfProcessor->process( isset( $_GET['upload'] ) ? $_GET : $_POST );
/**
 * Receives an attachment (by URL or as a raw upload), stores it in a temporary
 * file, sends that file to a Tika server and prints the extracted text as JSON.
 *
 * Every public entry point ends the request: results and errors are written
 * with die()/exit().
 */
class PdfController {
    /** @var string Directory in which temporary files are created. */
    private $tmp_path;

    /** @var string Full URL of the Tika endpoint the documents are sent to. */
    private $tika_server;

    public function __construct() {
        $this->tmp_path    = '/tmp/';
        $this->tika_server = 'http://www.example.com:9998/tika'; // URL to the Tika server, include the port number and the path /tika
    }

    /**
     * @return string Directory used for temporary files.
     */
    private function getTempPath() {
        return $this->tmp_path;
    }

    /**
     * @return string URL of the Tika endpoint.
     */
    private function get_tika_server() {
        return $this->tika_server;
    }

    /**
     * Creates an empty temporary file whose name ends in ".<type>".
     *
     * tempnam() creates the file atomically; it is then renamed to carry the
     * extension, so no stray extension-less file is left behind and the
     * returned path always refers to a file this process created.
     *
     * @param string $type File extension; reduced to at most 16 ASCII
     *                     letters/digits because it can come from a
     *                     user-supplied URL.
     * @return string|false Path of the temporary file, or false on failure.
     */
    private function createTempFile($type) {
        $type = substr((string) preg_replace('/[^A-Za-z0-9]/', '', (string) $type), 0, 16);
        $base = tempnam($this->getTempPath(), $type . '_');
        if (false === $base) {
            return false;
        }
        if ('' === $type) {
            return $base;
        }
        $named = $base . '.' . $type;
        if (!rename($base, $named)) {
            unlink($base);
            return false;
        }
        return $named;
    }

    /**
     * Removes a temporary file if it exists.
     *
     * @param mixed $tempfile Path returned by createTempFile().
     */
    private function discardTempFile($tempfile) {
        if (is_string($tempfile) && is_file($tempfile)) {
            unlink($tempfile);
        }
    }

    /**
     * Percent-encodes every byte that may not appear literally in a URL path,
     * query or userinfo component. Existing "%XX" escapes and the component
     * delimiters ("/", "?", "&", "=", ...) are left untouched, so an already
     * valid component is returned unchanged; a "%" that does not start a valid
     * escape becomes "%25".
     *
     * @param string $value Raw component.
     * @return string Encoded component.
     */
    private function escapeUrlPart($value) {
        return (string) preg_replace_callback(
            '/%(?![0-9A-Fa-f]{2})|[^A-Za-z0-9\-._~!$&\'()*+,;=:@\/?%]/',
            function ($match) {
                return rawurlencode($match[0]);
            },
            (string) $value
        );
    }

    /**
     * Rebuilds a fetchable URL from parse_url() output.
     *
     * The fragment is left out on purpose: it is never sent to a server.
     *
     * @param array $parts parse_url() result containing at least scheme and host.
     * @return string The encoded URL.
     */
    private function buildFetchUrl(array $parts) {
        $url = strtolower($parts['scheme']) . '://';
        if (isset($parts['user']) && '' !== $parts['user']) {
            $url .= $this->escapeUrlPart($parts['user']);
            if (isset($parts['pass'])) {
                $url .= ':' . $this->escapeUrlPart($parts['pass']);
            }
            $url .= '@';
        }
        $url .= $parts['host'];
        if (isset($parts['port'])) {
            $url .= ':' . $parts['port'];
        }
        $url .= $this->escapeUrlPart(isset($parts['path']) ? $parts['path'] : '/');
        if (isset($parts['query']) && '' !== $parts['query']) {
            $url .= '?' . $this->escapeUrlPart($parts['query']);
        }
        return $url;
    }

    /**
     * Takes the temp file, sends it to Tika, returns the results.
     *
     * The temporary file is removed on every path, including errors.
     *
     * @param string $tempfile Path of the file to process.
     * @return string JSON-encoded extracted text.
     */
    private function processTempFile($tempfile) {
        // Captured first so that it still describes the failed download/write.
        $last_error = error_get_last();

        clearstatcache(true, $tempfile);
        if (!is_file($tempfile) || 0 === filesize($tempfile)) {
            $this->discardTempFile($tempfile);
            $detail = is_array($last_error) ? $last_error['message'] : 'none reported';
            $this->returnError( 'Empty attachment file. Is the file publicly available? Server error: ' . $detail );
        }

        $fh_res = fopen( $tempfile, 'r' );
        if ( false === $fh_res ) {
            $this->discardTempFile($tempfile);
            $this->returnError( 'Could not read the temporary file.' );
        }

        $ch = curl_init( $this->get_tika_server() );
        if ( false === $ch ) {
            fclose( $fh_res );
            $this->discardTempFile($tempfile);
            $this->returnError( 'Could not initialize the connection to the Tika server.' );
        }
        curl_setopt( $ch, CURLOPT_PUT, 1 );
        curl_setopt( $ch, CURLOPT_INFILE, $fh_res );
        curl_setopt( $ch, CURLOPT_INFILESIZE, filesize( $tempfile ) );
        curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
        // Ask for the document text itself rather than Tika's default representation.
        curl_setopt( $ch, CURLOPT_HTTPHEADER, [ 'Accept: text/plain' ] );

        $text       = curl_exec( $ch );
        $curl_error = curl_error( $ch );
        $status     = (int) curl_getinfo( $ch, CURLINFO_HTTP_CODE );

        fclose( $fh_res );
        $this->discardTempFile($tempfile);

        if ( false === $text ) {
            $this->returnError( 'Could not get a response from the Tika server: ' . $curl_error );
        }
        if ( $status >= 400 ) {
            // Do not pass a Tika error page on as if it were document text.
            $this->returnError( 'Tika server responded with HTTP status ' . $status . '.' );
        }
        if ( '' === $text ) {
            $this->returnError( 'No text in the file.' );
        }

        $json = json_encode($text);
        if ( false === $json ) {
            // E.g. invalid UTF-8 in the extracted text; fail loudly instead of
            // answering with an empty 200 response.
            $this->returnError( 'Could not encode the extracted text: ' . json_last_error_msg() );
        }
        return $json;
    }

    /**
     * Fetches the attachment file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * Only http and https URLs are accepted, so that other stream wrappers
     * (file://, php://, ...) cannot be reached through this script.
     * NOTE(review): hosts are not restricted, so the script can still be
     * pointed at internal HTTP services; restrict access to it accordingly.
     *
     * @param string|null $url URL of the attachment.
     * @return string JSON-encoded extracted text.
     */
    public function processPDF($url = null) {
        $url_parts = is_string( $url ) ? parse_url( $url ) : false;
        if (
            ! is_array( $url_parts )
            || empty( $url_parts['host'] )
            || ! isset( $url_parts['scheme'] )
            || ! in_array( strtolower( $url_parts['scheme'] ), [ 'http', 'https' ], true )
        ) {
            $this->returnError("Not a valid URL.");
        }

        // The extension is whatever follows the last dot of the last path segment.
        $path      = isset( $url_parts['path'] ) ? $url_parts['path'] : '/';
        $slash     = strrpos( $path, '/' );
        $filename  = false === $slash ? $path : substr( $path, $slash + 1 );
        $dot       = strrpos( $filename, '.' );
        $extension = false === $dot ? '' : substr( $filename, $dot + 1 );

        $url = $this->buildFetchUrl( $url_parts );
        if (filter_var($url, FILTER_VALIDATE_URL) === false) {
            $this->returnError("Not a valid URL.");
        }

        $tempfile = $this->createTempFile($extension);
        if ( false === $tempfile ) {
            $this->returnError( 'Could not access the PDF file.' );
        }

        // A failed download leaves the temp file empty; processTempFile()
        // reports that together with the warning raised by fopen().
        $source = fopen($url, 'r');
        if ( false !== $source ) {
            file_put_contents($tempfile, $source);
            fclose($source);
        }
        return $this->processTempFile($tempfile);
    }

    /**
     * Takes the uploaded file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * @param mixed $upload Unused; the document is read from the request body.
     * @return string JSON-encoded extracted text.
     */
    public function processUploadedPDF($upload = null) {
        $pdf_content = file_get_contents('php://input');
        $tempfile    = $this->createTempFile("pdf");
        if ( false === $tempfile ) {
            $this->returnError( 'Could not access the PDF file.' );
        }
        file_put_contents($tempfile, false === $pdf_content ? '' : $pdf_content);
        return $this->processTempFile($tempfile);
    }

    /**
     * Logs the message, answers with HTTP 500 and a JSON error body, and ends
     * the request. This method never returns.
     *
     * @param string $msg Error description.
     */
    private function returnError($msg) {
        error_log($msg);
        if (!headers_sent()) {
            header('HTTP/1.0 500 Internal Server Error');
        }
        die(json_encode(array('error' => "PDF Processor error: " . $msg)));
    }

    /**
     * Starts the processing and calls the right processing function.
     *
     * @param array $data Request parameters; `url` selects fetching by URL,
     *                    `upload` selects reading the request body. An empty
     *                    array produces a plain status message.
     */
    public function process($data) {
        if (empty($data)) {
            echo "Relevanssi attachment handling services is up and running.";
            exit();
        }

        // NOTE(review): requests are not authenticated. An earlier check of
        // $data['key'] was disabled, and the isValidKey() method it relied on
        // is not defined in this class.

        if (isset($data['url'])) {
            $text = $this->processPDF($data['url']);
            die($text);
        }
        if (isset($data['upload'])) {
            $text = $this->processUploadedPDF($data['upload']);
            die($text);
        }
        $this->returnError("No action selected.");
    }
}
// Fallback definitions for the HTTP_URL_* bit flags accepted by
// http_build_url(). Each value is a distinct power of two so that the flags
// can be OR'ed together; a constant that already exists is left untouched.
defined('HTTP_URL_REPLACE')        || define('HTTP_URL_REPLACE', 1);
defined('HTTP_URL_JOIN_PATH')      || define('HTTP_URL_JOIN_PATH', 2);
defined('HTTP_URL_JOIN_QUERY')     || define('HTTP_URL_JOIN_QUERY', 4);
defined('HTTP_URL_STRIP_USER')     || define('HTTP_URL_STRIP_USER', 8);
defined('HTTP_URL_STRIP_PASS')     || define('HTTP_URL_STRIP_PASS', 16);
defined('HTTP_URL_STRIP_AUTH')     || define('HTTP_URL_STRIP_AUTH', 32);
defined('HTTP_URL_STRIP_PORT')     || define('HTTP_URL_STRIP_PORT', 64);
defined('HTTP_URL_STRIP_PATH')     || define('HTTP_URL_STRIP_PATH', 128);
defined('HTTP_URL_STRIP_QUERY')    || define('HTTP_URL_STRIP_QUERY', 256);
defined('HTTP_URL_STRIP_FRAGMENT') || define('HTTP_URL_STRIP_FRAGMENT', 512);
defined('HTTP_URL_STRIP_ALL')      || define('HTTP_URL_STRIP_ALL', 1024);
if (!function_exists('http_build_url')) {
    /**
     * Build a URL.
     *
     * The parts of the second URL will be merged into the first according to
     * the flags argument.
     *
     * @param mixed $url     (part(s) of) an URL in form of a string or
     *                       associative array like parse_url() returns
     * @param mixed $parts   same as the first argument
     * @param int   $flags   a bitmask of binary or'ed HTTP_URL constants;
     *                       HTTP_URL_REPLACE is the default
     * @param array $new_url if set, it will be filled with the parts of the
     *                       composed url like parse_url() would return
     * @return string the composed URL; components that are missing or empty
     *                are left out
     */
    function http_build_url($url, $parts = array(), $flags = HTTP_URL_REPLACE, &$new_url = array())
    {
        is_array($url) || $url = parse_url($url);
        is_array($parts) || $parts = parse_url($parts);

        // parse_url() returns false for seriously malformed input; continue
        // with an empty component list instead of indexing into false.
        is_array($url) || $url = array();
        is_array($parts) || $parts = array();

        isset($url['query']) && is_string($url['query']) || $url['query'] = null;
        isset($parts['query']) && is_string($parts['query']) || $parts['query'] = null;

        $keys = array('user', 'pass', 'port', 'path', 'query', 'fragment');

        // HTTP_URL_STRIP_ALL and HTTP_URL_STRIP_AUTH cover several other flags.
        if ($flags & HTTP_URL_STRIP_ALL) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS
                | HTTP_URL_STRIP_PORT | HTTP_URL_STRIP_PATH
                | HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT;
        } elseif ($flags & HTTP_URL_STRIP_AUTH) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS;
        }

        // Scheme and host are always replaced
        foreach (array('scheme', 'host') as $part) {
            if (isset($parts[$part])) {
                $url[$part] = $parts[$part];
            }
        }

        if ($flags & HTTP_URL_REPLACE) {
            foreach ($keys as $key) {
                if (isset($parts[$key])) {
                    $url[$key] = $parts[$key];
                }
            }
        } else {
            if (isset($parts['path']) && ($flags & HTTP_URL_JOIN_PATH)) {
                if (isset($url['path']) && substr($parts['path'], 0, 1) !== '/') {
                    // Resolve the relative path against the directory of the
                    // base path, i.e. everything before its last slash. Cutting
                    // at that slash (rather than string-replacing the basename)
                    // keeps directory names that happen to contain the
                    // basename intact.
                    $cut = strrpos($url['path'], '/');
                    $directory = false === $cut ? '' : substr($url['path'], 0, $cut);
                    $url['path'] = rtrim($directory, '/') . '/' . ltrim($parts['path'], '/');
                } else {
                    $url['path'] = $parts['path'];
                }
            }

            if (isset($parts['query']) && ($flags & HTTP_URL_JOIN_QUERY)) {
                if (isset($url['query'])) {
                    parse_str($url['query'], $url_query);
                    parse_str($parts['query'], $parts_query);
                    $url['query'] = http_build_query(
                        array_replace_recursive(
                            $url_query,
                            $parts_query
                        )
                    );
                } else {
                    $url['query'] = $parts['query'];
                }
            }
        }

        if (isset($url['path']) && $url['path'] !== '' && substr($url['path'], 0, 1) !== '/') {
            $url['path'] = '/' . $url['path'];
        }

        foreach ($keys as $key) {
            $strip = 'HTTP_URL_STRIP_' . strtoupper($key);
            if ($flags & constant($strip)) {
                unset($url[$key]);
            }
        }

        // A component counts as present when it is set and not the empty
        // string; unlike empty(), this keeps the legitimate value "0".
        $present = function ($key) use ($url) {
            return isset($url[$key]) && '' !== (string) $url[$key];
        };

        $parsed_string = '';

        if ($present('scheme')) {
            $parsed_string .= $url['scheme'] . '://';
        }

        if ($present('user')) {
            $parsed_string .= $url['user'];
            if (isset($url['pass'])) {
                $parsed_string .= ':' . $url['pass'];
            }
            $parsed_string .= '@';
        }

        if ($present('host')) {
            $parsed_string .= $url['host'];
        }

        if (!empty($url['port'])) {
            $parsed_string .= ':' . $url['port'];
        }

        if ($present('path')) {
            $parsed_string .= $url['path'];
        }

        if ($present('query')) {
            $parsed_string .= '?' . $url['query'];
        }

        if ($present('fragment')) {
            $parsed_string .= '#' . $url['fragment'];
        }

        $new_url = $url;

        return $parsed_string;
    }
}
Thank you!
I noticed that this PHP file only requests meta information from Tika Server but not the actual document text. This causes Relevanssi search to not work properly.
For Tika Server 2.7.0.0, the following line needs to be changed:
$this->tika_server = 'http://www.example.com:9998/meta';
to
$this->tika_server = 'http://www.example.com:9998/tika';
The endpoint must be /tika not /meta
Then, in the function processTempFile, add the following line somewhere between the curl_setopt
lines:
curl_setopt( $ch, CURLOPT_HTTPHEADER, [ 'Accept: text/plain' ] );
With these changes in place, Tika Server will only return the content of the documents as plaintext and then Relevanssi Search will work.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sure.