Skip to content

Instantly share code, notes, and snippets.

@codedokode
Created June 5, 2018 19:56
Show Gist options
  • Save codedokode/590f9094988bbd22db7164bb83e54405 to your computer and use it in GitHub Desktop.
Save codedokode/590f9094988bbd22db7164bb83e54405 to your computer and use it in GitHub Desktop.
Скрипт исправления сломанных тредов 1, 4b и 15
<?php
use Symfony\Component\DomCrawler\Crawler;
use phpClub\Util\DOMUtil;
set_time_limit(0);
require __DIR__ . '/../vendor/autoload.php';
$di = require __DIR__ . '/../src/Bootstrap.php';
/**
 * Repairs threads 1, 4b and 15, whose saved pages contain many broken chunks of HTML.
 *
 * Usage: php script.php [--dup-posts-ok] <input >output
 *
 * The page is read from stdin and the result is written to stdout;
 * diagnostic messages go to stderr. The --dup-posts-ok flag turns off
 * the clean-up of duplicated posts.
 */
/*
 * Regular expressions that help to locate markup problems:
 *
 * ="[^"<>]*(<|>[^>])
 * <[^<>"]*?[^=>]"
 * <([^<>"]|"[^"<>]{0,100}?"){0,100}?"[^"<>]*(<|>[^>])
 */
$stderr = fopen('php://stderr', 'a');
$html = file_get_contents('php://stdin');
if (false === $html) {
    // Without this check a failed read would be treated as an empty page,
    // reported as "Nothing to fix" and written out as empty output.
    fprintf($stderr, "Cannot read the page from stdin\n");
    exit(1);
}
// Strict mode: only the exact flag string switches the option on.
$dupPostsOk = in_array('--dup-posts-ok', $argv, true);
// Detect the thread number by the id of the thread container.
// Only the presence of the marker matters, so strpos() is sufficient.
if (false !== strpos($html, 'thread_236463')) {
    $threadNo = 1;
} elseif (false !== strpos($html, 'thread_345388')) {
    $threadNo = 15;
} elseif (false !== strpos($html, 'thread_280501')) {
    $threadNo = '4b';
} else {
    // Not one of the known broken threads: pass the page through unchanged.
    fprintf($stderr, "Nothing to fix here\n");
    echo $html;
    exit(0);
}
// Replacement tables, one per broken thread. Every key is a mangled HTML
// fragment found in the saved page; all of them map to an empty string,
// i.e. the fragment is removed when the table is applied with strtr().

// Thread 1 (thread_236463).
$thread1Fixes = array_fill_keys([
    '<a ">',
    '<span class="postertripr />',
    '<blockquofom mobile">',
    '<a onmouseover="showPostPreviepan>',
    '</spclass="postinfom>',
    '<span class="postpне давал.',
    '<blockqw(event)"',
    '<a class="postbtn_adm" href="http://2ch.hk/pr/res/236463.html#" style="display:none" onclick="javascript:addAdminMenu(this); return false;" onmouseout="javasмотреть. ',
], '');

// Thread 15 (thread_345388).
$thread15Fixes = array_fill_keys([
    '<div i); return false;">',
    '<blocst_347440" class="post">',
    '<a onmouseover="showPostPreview(event)" onmouseout="delPostь',
    '<table id="poишешь',
    '<span clabtn_rep" href="#" onclick="javascript:addQuickReply(\'347680\'); return false;">',
    '<span class="subjecss=" unkfunc">',
    '<a onmouseover="showP" reflink1">',
    '<span data-utc="1339778953" pr res 345388.html#348611">',
    '<a onmouseover="showPostPrevclass=" datetime postnum">',
], '');

// Thread 4b (thread_280501).
$thread4bFixes = array_fill_keys([
    '<span class="posternamlockquote id=" m281173">',
    '<a onmouseover="showPostPreview(event)" onmotpanel">',
    '</spa">',
    '<a class="postbtn_adm" href="http://2ch.pm/pr/res/280501.html#" style="display:none" onclick="javascript:addAdminMenu(tливость',
    '<a class="postbtn_adm" href="http://2ch.pm/pr/res/280501.html#" style="display:none" onclick="his);" return="" false;"="" onmouseout="javascript:removeAdminMenu(event); return false;">',
    '<input type="checkbox" name="delete" clm mobile">',
    '<a href="http://2ch.pm/pr/res/280501.html#286ass=" turnmeoff"="" value="286625">',
    '<a class="postbtn_adm" href="http://2ch.pm/pr/res/280501.html#" style="display:none" onclick="javascript:addAdminMenu(this); return false;" onmouseout="javascripts=" unkfunc"="">',
    '<span class="ref=" pr res 280501.html#288346">',
], '');
// Apply the replacement table of the detected thread and choose the markers
// that delimit the thread body inside the page.
// $threadNo is the integer 1 or 15, or the string '4b', so strict
// comparison avoids any numeric-string juggling.
if (1 === $threadNo) {
    $html = strtr($html, $thread1Fixes);
    $headerTill = '<div id="thread_236463" class="thread">';
    $footerFrom = '</div>[<a href="http://2ch.hk/pr/wakaba.html">Назад</a>]';
} elseif ('4b' === $threadNo) {
    $html = strtr($html, $thread4bFixes);
    $headerTill = '<div id="thread_280501" class="thread">';
    $footerFrom = '</div>[<a href="';
} elseif (15 === $threadNo) {
    $html = strtr($html, $thread15Fixes);
    $headerTill = '<div id="thread_345388" class="thread">';
    $footerFrom = '</div>[<a href="http://2ch.hk/pr/wakaba.html">Назад</a>]';
} else {
    throw new \Exception("Unknown thread number");
}

// Header: everything up to and including the opening tag of the thread container.
$headerParts = explode($headerTill, $html, 2);
if (2 !== count($headerParts)) {
    // Fail with a clear message instead of undefined-offset notices.
    throw new \Exception("Thread start marker not found: " . $headerTill);
}
list($header, $rest) = $headerParts;
$header .= $headerTill;

// Footer: starts at the first occurrence of the closing marker after the header.
$bodyParts = explode($footerFrom, $rest, 2);
if (2 !== count($bodyParts)) {
    throw new \Exception("Thread end marker not found: " . $footerFrom);
}
list($body, $footer) = $bodyParts;
$footer = $footerFrom . $footer;

// Sanity check: the three parts must add up to the whole page.
if (strlen($html) !== strlen($header) + strlen($body) + strlen($footer)) {
    throw new \Exception("Data lost while splitting");
}
if ($dupPostsOk) {
    // Duplicated posts are acceptable: keep the thread body as it is.
    // It must be the body only, not the whole page, because the header
    // and the footer are added again when the result is printed below.
    $newBody = $body;
} else {
    // Parse the body, take the direct children of <body> and rebuild the
    // body from the nodes returned by fixDupPosts().
    $bodyCrawler = new Crawler($body);
    $posts = $bodyCrawler->filterXPath('//body/*');
    $newNodes = fixDupPosts($posts, $stderr);
    $newBodyParts = [];
    foreach ($newNodes as $node) {
        $newBodyParts[] = DOMUtil::getOuterHtml($node);
    }
    $newBody = implode("\n", $newBodyParts);
    // Report how much the body changed.
    fprintf(
        $stderr,
        "Old body: %d bytes, %d posts, new: %d bytes, %d posts\n",
        strlen($body),
        $posts->count(),
        strlen($newBody),
        $newNodes->count()
    );
}
// Reassemble the page around the (possibly rebuilt) body.
echo $header . $newBody . $footer;
/**
 * Removes duplicated posts from a list of post elements.
 *
 * Elements are identified by their trimmed "id" attribute. When the same id
 * occurs several times, the last element with that id is kept and placed at
 * the position where the id was first seen. Elements without an id are
 * reported to $stderr and dropped.
 *
 * @param Crawler  $posts  Elements to de-duplicate.
 * @param resource $stderr Stream that receives the diagnostic messages.
 *
 * @return Crawler A new crawler holding the surviving DOM nodes.
 */
function fixDupPosts(Crawler $posts, $stderr)
{
    fprintf($stderr, "%d posts found\n", $posts->count());

    $nodesById = [];
    $replaced = 0;
    $posts->each(function ($post) use (&$nodesById, &$replaced, $stderr) {
        $node = $post->getNode(0);
        // attr() may return null for a missing attribute; cast it so that
        // trim() always receives a string. Ids sometimes contain whitespace.
        $id = trim((string) $post->attr('id'));
        // Compare with '' explicitly so that an id of "0" is not mistaken for a missing one.
        if ('' === $id) {
            fprintf($stderr, "Empty id at html, skip it: %s\n\n", DOMUtil::getOuterHtml($node));
            return;
        }
        if (array_key_exists($id, $nodesById)) {
            $replaced++;
        }
        // Overwriting an existing key keeps its original position in the array.
        $nodesById[$id] = $node;
    });
    fprintf($stderr, "%d nodes replaced\n", $replaced);

    $newNodes = new Crawler;
    foreach ($nodesById as $node) {
        $newNodes->addNode($node);
    }
    return $newNodes;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment