Skip to content

Instantly share code, notes, and snippets.

@msaari
Last active Apr 23, 2020
Embed
What would you like to do?
Relevanssi attachment indexing server intermediary script
<?php
/**
* Attachment processing intermediary to work between Relevanssi and a Tika server.
*
* Installation instructions:
* 1. Save this as index.php.
* 2. Change the Tika server URL in the constructor to point to your own Tika server.
* 3. Upload this file in a directory on your server.
*
* @author Mikko Saari (mikko@mikkosaari.fi)
* @license GNU General Public License
* @see https://www.relevanssi.com/user-manual/attachment-server/
*/
// Entry point: requests carrying an "upload" query parameter are handled with
// the GET parameters, every other request with the POST parameters.
$PdfProcessor = new PdfController();
$PdfProcessor->process( isset( $_GET['upload'] ) ? $_GET : $_POST );
/**
 * Stores an attachment in a temporary file, sends that file to a Tika server
 * with an HTTP PUT request and prints the server's response as a JSON-encoded
 * string.
 *
 * The attachment is either downloaded from a URL (processPDF()) or read from
 * the request body (processUploadedPDF()). Every failure ends the request
 * through returnError(), which answers with HTTP 500 and a JSON object of the
 * form {"error": "..."}.
 */
class PdfController {
    /** @var string Directory in which the temporary attachment files are created. */
    private $tmp_path;

    /** @var string URL of the Tika server the attachments are sent to. */
    private $tika_server;

    public function __construct() {
        $this->tmp_path    = '/tmp/';
        $this->tika_server = 'http://www.example.com/'; // URL to the Tika server
    }

    /**
     * @return string Directory used for temporary files.
     */
    private function getTempPath() {
        return $this->tmp_path;
    }

    /**
     * @return string URL of the Tika server.
     */
    private function getTikaServer() {
        return $this->tika_server;
    }

    /**
     * Creates an empty temporary file whose name ends in ".$type".
     *
     * tempnam() creates a file without the extension, so that file is renamed
     * to the final name; otherwise an orphaned extension-less file would be
     * left in the temp directory on every request.
     *
     * @param string $type File extension without the dot; also used as name prefix.
     * @return string|false Path of the temporary file, false if it could not be created.
     */
    private function createTempFile($type) {
        $base = tempnam($this->getTempPath(), $type . '_');
        if (false === $base) {
            return false;
        }
        $path = $base . '.' . $type;
        if (!rename($base, $path)) {
            unlink($base);
            return false;
        }
        return $path;
    }

    /**
     * Deletes a temporary file if it exists.
     *
     * @param string $tempfile Path of the file to delete.
     */
    private function removeTempFile($tempfile) {
        if (is_file($tempfile)) {
            unlink($tempfile);
        }
    }

    /**
     * Removes the temp file and ends the request with the "empty attachment"
     * error, including the last PHP error message when there is one.
     *
     * @param string $tempfile Path of the temporary file to clean up.
     */
    private function failWithEmptyFile($tempfile) {
        // Read the last error first: cleaning up may itself raise a warning.
        $last_error = error_get_last();
        $detail     = isset($last_error['message']) ? $last_error['message'] : '';
        $this->removeTempFile($tempfile);
        $this->returnError('Empty attachment file. Is the file publicly available? Server error: ' . $detail);
    }

    /**
     * Percent-encodes a URL path one segment at a time, so the "/" separators
     * stay intact and the path still addresses the same resource.
     *
     * @param string $path Path component as returned by parse_url().
     * @return string Encoded path, always starting with "/".
     */
    private function encodeUrlPath($path) {
        if ('/' === substr($path, 0, 1)) {
            $path = (string) substr($path, 1);
        }
        return '/' . implode('/', array_map('rawurlencode', explode('/', $path)));
    }

    /**
     * Percent-encodes the keys and values of a query string while keeping the
     * "&" and "=" separators intact.
     *
     * @param string $query Query component as returned by parse_url().
     * @return string Encoded query string.
     */
    private function encodeUrlQuery($query) {
        $encoded = array();
        foreach (explode('&', $query) as $pair) {
            $pieces = explode('=', $pair, 2);
            $item   = rawurlencode($pieces[0]);
            if (isset($pieces[1])) {
                $item .= '=' . rawurlencode($pieces[1]);
            }
            $encoded[] = $item;
        }
        return implode('&', $encoded);
    }

    /**
     * Validates and normalises the attachment URL sent by the client.
     *
     * Only http and https are accepted: the URL is handed to fopen(), and
     * other stream wrappers (file://, php://, ...) would let a client read
     * files from this server.
     *
     * @param mixed $url URL as received in the request.
     * @return string|false Encoded URL, or false if the URL is not acceptable.
     */
    private function sanitizeUrl($url) {
        if (!is_string($url) || '' === $url) {
            return false;
        }
        $url_parts = parse_url($url);
        if (!is_array($url_parts) || !isset($url_parts['scheme'])) {
            return false;
        }
        if (!in_array(strtolower($url_parts['scheme']), array('http', 'https'), true)) {
            return false;
        }
        $url_parts['path']     = $this->encodeUrlPath(isset($url_parts['path']) ? $url_parts['path'] : '/');
        $url_parts['query']    = isset($url_parts['query']) ? $this->encodeUrlQuery($url_parts['query']) : '';
        $url_parts['fragment'] = isset($url_parts['fragment']) ? rawurlencode($url_parts['fragment']) : '';

        $clean_url = http_build_url($url_parts);
        if (false === filter_var($clean_url, FILTER_VALIDATE_URL)) {
            return false;
        }
        return $clean_url;
    }

    /**
     * Takes the temp file, sends it to Tika, returns the results.
     *
     * The temp file is always deleted before this method returns or fails.
     *
     * @param string $tempfile Path of the temporary file holding the attachment.
     * @return string The Tika response encoded as a JSON string.
     */
    private function processTempFile($tempfile) {
        $size = filesize($tempfile);
        if (false === $size || 0 === $size) {
            $this->failWithEmptyFile($tempfile);
        }

        $fh_res = fopen($tempfile, 'r');
        if (false === $fh_res) {
            $this->removeTempFile($tempfile);
            $this->returnError('Could not access the PDF file.');
        }

        $ch = curl_init($this->getTikaServer());
        curl_setopt($ch, CURLOPT_PUT, 1);
        curl_setopt($ch, CURLOPT_INFILE, $fh_res);
        curl_setopt($ch, CURLOPT_INFILESIZE, $size);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        $text = curl_exec($ch);
        // Fetch the error text right away, while the handle still holds it.
        $curl_error = (false === $text) ? curl_error($ch) : '';

        fclose($fh_res);
        $this->removeTempFile($tempfile);

        if (false === $text) {
            // A transport failure is not the same thing as an empty document.
            $this->returnError('Could not get a response from the Tika server: ' . $curl_error);
        }
        if ('' === $text) {
            $this->returnError('No text in the file.');
        }

        $json = json_encode($text);
        if (false === $json) {
            // json_encode() fails e.g. on invalid UTF-8; without this check the
            // client would receive an empty response with status 200.
            $this->returnError('Could not encode the extracted text as JSON.');
        }
        return $json;
    }

    /**
     * Fetches the attachment file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * @param string|null $url Public http(s) URL of the attachment.
     * @return string The Tika response encoded as a JSON string.
     */
    public function processPDF($url = null) {
        $url = $this->sanitizeUrl($url);
        if (false === $url) {
            $this->returnError("Not a valid URL.");
        }

        $tempfile = $this->createTempFile("pdf");
        if (false === $tempfile) {
            $this->returnError('Could not access the PDF file.');
        }

        $source = fopen($url, 'r');
        if (false === $source) {
            $this->failWithEmptyFile($tempfile);
        }
        file_put_contents($tempfile, $source);
        fclose($source);

        return $this->processTempFile($tempfile);
    }

    /**
     * Takes the uploaded file, saves it in a temp file and calls
     * processTempFile() for processing.
     *
     * The file content is read from the raw request body (php://input).
     *
     * @param mixed $upload Not used; kept so existing callers keep working.
     * @return string The Tika response encoded as a JSON string.
     */
    public function processUploadedPDF($upload = null) {
        $pdf_content = file_get_contents('php://input');

        $tempfile = $this->createTempFile("pdf");
        if (false === $tempfile) {
            $this->returnError('Could not access the PDF file.');
        }

        // An unreadable request body is stored as an empty file, which
        // processTempFile() reports as an empty attachment.
        file_put_contents($tempfile, false === $pdf_content ? '' : $pdf_content);

        return $this->processTempFile($tempfile);
    }

    /**
     * Logs the message, answers with HTTP 500 and a JSON error object, and
     * terminates the script.
     *
     * @param string $msg Error description.
     */
    private function returnError($msg) {
        error_log($msg);
        header('HTTP/1.0 500 Internal Server Error');
        die(json_encode(array('error' => "PDF Processor error: " . $msg)));
    }

    /**
     * Starts the processing and calls the right processing function.
     *
     * Prints a status message when there are no request parameters, otherwise
     * prints the processing result. This method always terminates the script.
     *
     * @param array $data Request parameters; "url" or "upload" selects the action.
     */
    public function process($data) {
        if (empty($data)) {
            echo "Relevanssi attachment handling services is up and running.";
            exit();
        }

        // API key validation is disabled; isValidKey() is not defined in this class.
        /*
        if (!isset($data['key'])) {
        $this->returnError("Key is missing.");
        }
        if (!$this->isValidKey($data['key'])) {
        $this->returnError("Key " . $data['key'] . " is not valid.");
        }
        */

        if (isset($data['url'])) {
            $text = $this->processPDF($data['url']);
            die($text);
        }
        if (isset($data['upload'])) {
            $text = $this->processUploadedPDF($data['upload']);
            die($text);
        }
        $this->returnError("No action selected.");
    }
}
// Flag constants for http_build_url(). Each one is defined only when it does
// not exist yet, so definitions that are already present are never overridden.

// How the second URL is merged into the first.
defined('HTTP_URL_REPLACE') || define('HTTP_URL_REPLACE', 1);
defined('HTTP_URL_JOIN_PATH') || define('HTTP_URL_JOIN_PATH', 2);
defined('HTTP_URL_JOIN_QUERY') || define('HTTP_URL_JOIN_QUERY', 4);

// Which parts are removed from the resulting URL.
defined('HTTP_URL_STRIP_USER') || define('HTTP_URL_STRIP_USER', 8);
defined('HTTP_URL_STRIP_PASS') || define('HTTP_URL_STRIP_PASS', 16);
defined('HTTP_URL_STRIP_AUTH') || define('HTTP_URL_STRIP_AUTH', 32);
defined('HTTP_URL_STRIP_PORT') || define('HTTP_URL_STRIP_PORT', 64);
defined('HTTP_URL_STRIP_PATH') || define('HTTP_URL_STRIP_PATH', 128);
defined('HTTP_URL_STRIP_QUERY') || define('HTTP_URL_STRIP_QUERY', 256);
defined('HTTP_URL_STRIP_FRAGMENT') || define('HTTP_URL_STRIP_FRAGMENT', 512);
defined('HTTP_URL_STRIP_ALL') || define('HTTP_URL_STRIP_ALL', 1024);
if (!function_exists('http_build_url')) {
    /**
     * Build a URL.
     *
     * The parts of the second URL will be merged into the first according to
     * the flags argument. This definition is only used when no
     * http_build_url() function exists yet.
     *
     * @param mixed $url     (part(s) of) an URL in form of a string or
     *                       associative array like parse_url() returns
     * @param mixed $parts   same as the first argument
     * @param int   $flags   a bitmask of binary or'ed HTTP_URL constants;
     *                       HTTP_URL_REPLACE is the default
     * @param array $new_url if set, it will be filled with the parts of the
     *                       composed url like parse_url() would return
     * @return string
     */
    function http_build_url($url, $parts = array(), $flags = HTTP_URL_REPLACE, &$new_url = array())
    {
        is_array($url) || $url = parse_url($url);
        is_array($parts) || $parts = parse_url($parts);

        // parse_url() returns false for seriously malformed URLs; continue
        // with an empty set of parts instead of indexing into false.
        is_array($url) || $url = array();
        is_array($parts) || $parts = array();

        isset($url['query']) && is_string($url['query']) || $url['query'] = null;
        isset($parts['query']) && is_string($parts['query']) || $parts['query'] = null;

        $keys = array('user', 'pass', 'port', 'path', 'query', 'fragment');

        // HTTP_URL_STRIP_ALL and HTTP_URL_STRIP_AUTH cover several other flags.
        if ($flags & HTTP_URL_STRIP_ALL) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS
                | HTTP_URL_STRIP_PORT | HTTP_URL_STRIP_PATH
                | HTTP_URL_STRIP_QUERY | HTTP_URL_STRIP_FRAGMENT;
        } elseif ($flags & HTTP_URL_STRIP_AUTH) {
            $flags |= HTTP_URL_STRIP_USER | HTTP_URL_STRIP_PASS;
        }

        // Scheme and host are always replaced
        foreach (array('scheme', 'host') as $part) {
            if (isset($parts[$part])) {
                $url[$part] = $parts[$part];
            }
        }

        if ($flags & HTTP_URL_REPLACE) {
            foreach ($keys as $key) {
                if (isset($parts[$key])) {
                    $url[$key] = $parts[$key];
                }
            }
        } else {
            if (isset($parts['path']) && ($flags & HTTP_URL_JOIN_PATH)) {
                if (isset($url['path']) && substr($parts['path'], 0, 1) !== '/') {
                    // Relative path: resolve it against the directory of the
                    // base path, i.e. everything before the last "/". Cutting
                    // at the last slash avoids removing other occurrences of
                    // the final segment's text from the base path.
                    $last_slash = strrpos($url['path'], '/');
                    $directory  = (false === $last_slash) ? '' : substr($url['path'], 0, $last_slash);
                    $url['path'] = rtrim($directory, '/') . '/' . ltrim($parts['path'], '/');
                } else {
                    $url['path'] = $parts['path'];
                }
            }

            if (isset($parts['query']) && ($flags & HTTP_URL_JOIN_QUERY)) {
                if (isset($url['query'])) {
                    parse_str($url['query'], $url_query);
                    parse_str($parts['query'], $parts_query);
                    // Values from $parts win over values already in $url.
                    $url['query'] = http_build_query(
                        array_replace_recursive(
                            $url_query,
                            $parts_query
                        )
                    );
                } else {
                    $url['query'] = $parts['query'];
                }
            }
        }

        // A non-empty path always starts with a slash.
        if (isset($url['path']) && $url['path'] !== '' && substr($url['path'], 0, 1) !== '/') {
            $url['path'] = '/' . $url['path'];
        }

        // Drop every part whose HTTP_URL_STRIP_* flag is set.
        foreach ($keys as $key) {
            $strip = 'HTTP_URL_STRIP_' . strtoupper($key);
            if ($flags & constant($strip)) {
                unset($url[$key]);
            }
        }

        // Assemble the URL string from the remaining non-empty parts.
        $parsed_string = '';

        if (!empty($url['scheme'])) {
            $parsed_string .= $url['scheme'] . '://';
        }

        if (!empty($url['user'])) {
            $parsed_string .= $url['user'];
            if (isset($url['pass'])) {
                $parsed_string .= ':' . $url['pass'];
            }
            $parsed_string .= '@';
        }

        if (!empty($url['host'])) {
            $parsed_string .= $url['host'];
        }

        if (!empty($url['port'])) {
            $parsed_string .= ':' . $url['port'];
        }

        if (!empty($url['path'])) {
            $parsed_string .= $url['path'];
        }

        if (!empty($url['query'])) {
            $parsed_string .= '?' . $url['query'];
        }

        if (!empty($url['fragment'])) {
            $parsed_string .= '#' . $url['fragment'];
        }

        $new_url = $url;

        return $parsed_string;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment