Skip to content

Instantly share code, notes, and snippets.

@mytory
Last active March 1, 2016 10:51
Show Gist options
  • Save mytory/96e17f9386ee1afbb97e to your computer and use it in GitHub Desktop.
Save mytory/96e17f9386ee1afbb97e to your computer and use it in GitHub Desktop.
Created by [gnoownow10](https://github.com/gnoownow10). Converts docx, doc, and hwp documents to HTML.
<?php
/**
 * Converts document contents (txt, html, doc, docx, hwp) to HTML/XHTML markup.
 *
 * The doc/docx and hwp handlers shell out to external programs, so the static
 * properties $libreoffice and $hwp5html must be assigned the command to run
 * before those handlers are used; this class never assigns them itself.
 *
 * @see dependencies: libreoffice, pyhwp https://pythonhosted.org/pyhwp/ko/
 */
class Anything2html {
    /** @var string Command used to invoke pyhwp's hwp5html converter. */
    public static $hwp5html;

    /** @var string Command used to invoke LibreOffice. */
    public static $libreoffice;

    /**
     * @var bool Reset to false at the start of convert(); set to true when the
     *           conversion failed or no handler existed for the extension.
     */
    public static $error;

    /**
     * Converts $content by dispatching to the "<extension>_handler" method.
     *
     * @param string   $content         Raw contents of the document.
     * @param string   $extension       File extension used to pick the handler.
     * @param callable $default_handler Called with $content when this class has
     *                                  no handler for $extension.
     * @param callable $error_handler   Called with the ExecFailedException when
     *                                  the external converter fails.
     * @return mixed The handler's markup, or whatever the invoked callback returns.
     * @throws Exception When a required executable is missing or the workspace
     *                   cannot be created; only ExecFailedException is routed
     *                   to $error_handler.
     */
    public static function convert($content, $extension, $default_handler, $error_handler) {
        static::$error = false;
        $handler = "{$extension}_handler";

        if (! method_exists(__CLASS__, $handler)) {
            static::$error = true;
            return call_user_func($default_handler, $content);
        }

        try {
            return call_user_func([__CLASS__, $handler], $content);
        } catch (ExecFailedException $e) {
            static::$error = true;
            return call_user_func($error_handler, $e);
        }
    }

    /**
     * Reports whether `which` can resolve the given command.
     *
     * @param string $cmd Command to look up. It is interpolated into the shell
     *                    command as-is, exactly like the handlers later invoke
     *                    it, so it must be a trusted configuration value.
     * @return bool True when `which` printed at least one line.
     */
    public static function executable_exists($cmd) {
        // An unset or blank command can never resolve; skip spawning a shell.
        if (! is_string($cmd) || trim($cmd) === '') {
            return false;
        }
        exec("which $cmd", $output, $exit_code);
        return (bool) $output;
    }

    /**
     * Returns the serialized children of the first <body> element.
     *
     * @param string $html Well-formed XML/XHTML markup (it is parsed with loadXML).
     * @return string Inner markup of <body>, or '' when there is no <body>
     *                element (including when the markup could not be parsed).
     */
    public static function get_body_inner_html($html) {
        $dom = new DOMDocument();
        $dom->loadXML($html);
        $body = $dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            return '';
        }

        $inner_html = '';
        foreach ($body->childNodes as $child) {
            $inner_html .= $dom->saveHTML($child);
        }
        return $inner_html;
    }

    /**
     * Wraps plain text in a <pre> element and returns it as an XML document.
     *
     * @param string $content Plain text.
     * @return string Serialized XML document.
     */
    public static function txt_handler($content) {
        $content = htmlentities($content);
        return self::html_handler("<meta charset='UTF-8'><pre>$content</pre>");
    }

    /**
     * Parses HTML leniently and re-serializes it as XML.
     *
     * @param string $content HTML markup.
     * @return string Serialized XML document.
     */
    public static function html_handler($content) {
        $dom = new DOMDocument();
        $dom->loadHTML(self::strip_invalid_xml_chars($content));
        return $dom->saveXML();
    }

    /**
     * Converts a Word document to HTML by running LibreOffice headlessly.
     *
     * @param string $content   Raw document bytes.
     * @param string $extension Extension given to the temporary input file.
     * @return string The converted document's root element serialized as XML.
     * @throws Exception           When the LibreOffice executable is not found.
     * @throws ExecFailedException When the conversion fails or yields no output.
     */
    public static function docx_handler($content, $extension = 'docx') {
        if (! self::executable_exists(self::$libreoffice)) {
            throw new Exception("libreoffice executable is not found");
        }

        # prepare file names.
        $workspace = self::get_workspace();
        $origin_file = $workspace . DIRECTORY_SEPARATOR . "document.$extension";
        $out_dir = $workspace . DIRECTORY_SEPARATOR . 'html';
        $out_file = $out_dir . DIRECTORY_SEPARATOR . "document.html";
        file_put_contents($origin_file, $content);

        # execute the command
        # HOME points at the workspace for the duration of the command. Every
        # value is passed as a sprintf argument (never spliced into the format
        # string) so a stray "%" cannot corrupt the command.
        $cmd = sprintf(
            'export HOME=%s && %s --headless --convert-to html:HTML %s --outdir %s 2>&1',
            escapeshellarg($workspace),
            self::$libreoffice,
            escapeshellarg($origin_file),
            escapeshellarg($out_dir)
        );
        exec($cmd, $output, $exit_code);

        # sometimes libreoffice returns 139 even if it already did conversion fine
        $has_error = ($exit_code !== 0) && ($exit_code !== 139);
        if ($has_error) {
            throw new ExecFailedException("Failed to convert to html", $exit_code, $cmd, $output);
        }

        # Because 139 is tolerated above, confirm that output really exists.
        $converted = self::read_converted_file($out_file, $exit_code, $cmd, $output);

        $dom = new DOMDocument();
        $dom->loadHTML($converted);
        return $dom->saveXML($dom->documentElement);
    }

    /**
     * Converts an HWP document to XHTML by running hwp5html.
     *
     * @param string $content Raw document bytes.
     * @return string Contents of the generated index.xhtml.
     * @throws Exception           When the hwp5html executable is not found.
     * @throws ExecFailedException When the conversion fails or yields no output.
     */
    public static function hwp_handler($content) {
        if (! self::executable_exists(self::$hwp5html)) {
            throw new Exception("hwp5html executable is not found.");
        }

        # prepare file names
        $workspace = self::get_workspace();
        $origin_file = $workspace . DIRECTORY_SEPARATOR . "document.hwp";
        $out_dir = $workspace . DIRECTORY_SEPARATOR . "html";
        $out_file = $out_dir . DIRECTORY_SEPARATOR . 'index.xhtml';
        file_put_contents($origin_file, $content);

        # execute the command (both paths escaped, matching docx_handler)
        $cmd = sprintf(
            '%s %s %s 2>&1',
            self::$hwp5html,
            escapeshellarg($origin_file),
            escapeshellarg($out_dir)
        );
        exec($cmd, $output, $exit_code);

        if ($exit_code !== 0) {
            throw new ExecFailedException("Failed to convert to html", $exit_code, $cmd, $output);
        }

        return self::read_converted_file($out_file, $exit_code, $cmd, $output);
    }

    /**
     * Converts a legacy .doc document; same pipeline as docx_handler().
     *
     * @param string $content Raw document bytes.
     * @return string The converted document's root element serialized as XML.
     */
    public static function doc_handler($content) {
        return self::docx_handler($content, 'doc');
    }

    /**
     * Removes characters that are not allowed in an XML 1.0 document.
     *
     * @param string $content Text, normally UTF-8.
     * @return string $content without the disallowed characters. When $content
     *                is not valid UTF-8 only the disallowed ASCII control bytes
     *                are removed and all other bytes are kept.
     */
    public static function strip_invalid_xml_chars($content) {
        // See http://www.w3.org/TR/xml/#charsets
        // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        $stripped = preg_replace(
            '/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u',
            '',
            $content
        );
        if ($stripped !== null) {
            return $stripped;
        }

        // With the "u" modifier preg_replace() yields null for input that is
        // not valid UTF-8. Fall back to a byte-wise pass that only drops the
        // disallowed ASCII control bytes instead of discarding the document.
        return (string) preg_replace('/[^\x09\x0A\x0D\x20-\xFF]/', '', $content);
    }

    /**
     * Returns the contents of a converter's output file.
     *
     * @param string   $out_file  Path the converter was expected to write.
     * @param int      $exit_code Exit code of the converter command.
     * @param string   $cmd       Command that was executed.
     * @param string[] $output    Output lines captured from the command.
     * @return string Non-empty file contents.
     * @throws ExecFailedException When the file is missing, unreadable or empty.
     */
    private static function read_converted_file($out_file, $exit_code, $cmd, $output) {
        $converted = is_file($out_file) ? file_get_contents($out_file) : false;
        if ($converted === false || $converted === '') {
            throw new ExecFailedException(
                "Failed to convert to html: no output was produced",
                $exit_code,
                $cmd,
                $output
            );
        }
        return $converted;
    }

    /**
     * Creates a temporary working directory that is removed at script shutdown.
     *
     * @return string Path of the new directory.
     * @throws Exception When the directory cannot be created.
     */
    private static function get_workspace() {
        // Create a randomly named file and take its name. The earlier approach
        // of reading the name from exec output is no longer used because it
        // caused compatibility problems on Mac.
        $workspace = tempnam(sys_get_temp_dir(), 'a2html.');
        if ($workspace === false) {
            throw new Exception("Failed to allocate a temporary workspace.");
        }

        // Delete the file and create a directory with the same name.
        unlink($workspace);
        if (! is_dir($workspace) && ! mkdir($workspace, 0777, true)) {
            throw new Exception("Failed to create the temporary workspace directory.");
        }

        register_shutdown_function(function () use ($workspace) {
            self::rm_rf($workspace);
        });
        return $workspace;
    }

    /**
     * Recursively deletes $path, refusing paths that do not contain the
     * system temp directory.
     *
     * @param string $path Directory to delete.
     * @return void Terminates the script when $path is not a temp path.
     */
    private static function rm_rf($path) {
        // A "contains" test (not a prefix test) is kept on purpose: it still
        // matches if tempnam() returned the path with an extra leading
        // component in front of the temp directory.
        if (strpos($path, sys_get_temp_dir()) !== false) {
            // Best-effort cleanup: failures are deliberately ignored.
            @exec(sprintf("rm -rf %s", escapeshellarg($path)));
        } else {
            echo 'You do not rm folder that is not temp.';
            exit;
        }
    }
}
/**
 * Thrown when an external command fails; carries the command line that was
 * run and the output it produced.
 */
class ExecFailedException extends Exception {
    /** @var string The command that was executed. */
    public $cmd;

    /** @var string The command's output, with lines joined by "\n". */
    public $output;

    /**
     * @param string          $message  Human-readable description of the failure.
     * @param int             $code     Exit code of the command.
     * @param string          $cmd      The command that was executed.
     * @param string[]|string $output   Output lines as filled in by exec(), or
     *                                  the output already joined into a string.
     * @param Exception|null  $previous Previous exception, if any.
     */
    public function __construct($message, $code, $cmd, $output, $previous = null) {
        $this->cmd = $cmd;
        // exec() supplies an array of lines; also accept a plain string so the
        // constructor does not fail inside implode() when given one.
        $this->output = is_array($output) ? implode("\n", $output) : (string) $output;
        parent::__construct($message, $code, $previous);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment