Skip to content

Instantly share code, notes, and snippets.

@karlazz
Created September 23, 2020 02:01
Show Gist options
  • Save karlazz/6ac5b6e7e36803abe55e798d15d63970 to your computer and use it in GitHub Desktop.
Save karlazz/6ac5b6e7e36803abe55e798d15d63970 to your computer and use it in GitHub Desktop.
Scraper example that uses Simple HTML DOM and integrates with WordPress.
<?php
include ('./simple_html_dom.php');
include ('./wp-load.php');
/**
 * Allows <iframe> elements in the 'post' kses context.
 *
 * Used as a callback for the 'wp_kses_allowed_html' filter. For any context
 * other than 'post' the tag list is returned untouched; for 'post' the
 * 'iframe' entry is set (replacing any existing one) to a fixed attribute
 * whitelist.
 *
 * @param array  $tags    Allowed-tags map, keyed by tag name.
 * @param string $context Context the allowed tags are being requested for.
 * @return array The (possibly extended) allowed-tags map.
 */
function custom_wpkses_allow_iframe_here( $tags, $context ) {
    // Only the 'post' context gets the extra tag.
    if ( 'post' !== $context ) {
        return $tags;
    }

    // Every whitelisted attribute maps to true, in this exact order.
    $iframe_attributes = array_fill_keys(
        array( 'src', 'height', 'width', 'frameborder', 'allowfullscreen' ),
        true
    );
    $tags['iframe'] = $iframe_attributes;

    return $tags;
}
add_filter( 'wp_kses_allowed_html', 'custom_wpkses_allow_iframe_here', 10, 2 );

// ---------------------------------------------------------------------------
// Import script: reads a list of page URLs (one per line) from
// to_be_scraped.txt, scrapes each page with Simple HTML DOM and inserts it as
// a published WordPress post, including the Yoast SEO description/title meta.
// ---------------------------------------------------------------------------

$file = file_get_contents('to_be_scraped.txt');
if ($file === false) {
    // Keep going with an empty list so the summary lines are still printed.
    echo "\nError: could not read to_be_scraped.txt\n";
    $file = '';
}

// Trim every line and drop empty ones so a trailing newline, blank lines or
// "\r" left over from CRLF files are not fetched as bogus URLs.
$pages = array_filter(array_map('trim', explode("\n", $file)), 'strlen');
echo "Pages: " . count($pages) . "\n";

// Returns the first element matching $selector, or null when nothing matches.
$firstMatch = function ($dom, $selector) {
    $found = $dom->find($selector);
    return isset($found[0]) ? $found[0] : null;
};

// Returns the "content" attribute of the first matching <meta> element, trying
// the selectors in order; empty string when none of them yields a value.
$metaContent = function ($dom, array $selectors) use ($firstMatch) {
    foreach ($selectors as $selector) {
        $node = $firstMatch($dom, $selector);
        if ($node !== null) {
            $value = $node->content;
            return (is_string($value)) ? $value : '';
        }
    }
    return '';
};

// Legacy markup rewrites, applied in this order: re-point uploaded file URLs
// and turn <audio> tags into [audio] shortcodes.
$search = [
    '/uploads/1/2/6/5/126588896/',
    'preload="none" data-autostart="no" data-artist="" data-track="Audio Track"',
    '<audio',
    "></audio>",
    'class="wsite-mejs-align-left wsite-mejs-dark" src',
];
$replace = [
    '/xxxxx/files/legacy/',
    '',
    '[audio',
    "][/audio]",
    'src',
];

$i = 0; // number of posts successfully inserted
foreach ($pages as $p) {
    // file_get_html() emits warnings on unreachable URLs; failure is reported below.
    $html = @file_get_html($p);
    if (!$html) {
        echo "\nError: " . $p . "\n";
        continue;
    }

    $titleNode = $firstMatch($html, 'h2[class=blog-title]');
    $dateNode = $firstMatch($html, 'span[class=date-text]');
    $bodyNode = $firstMatch($html, 'div[class=blog-content]');
    $timestamp = ($dateNode !== null) ? strtotime(trim($dateNode->plaintext)) : false;

    // Without a title, body and parsable date there is nothing sensible to
    // import; skip instead of dereferencing null or inserting a 1970 date.
    if ($titleNode === null || $bodyNode === null || $timestamp === false) {
        echo "\nError: " . $p . " (missing title, date or content)\n";
        $html->clear();
        unset($html);
        continue;
    }

    $postDate = date("Y-m-d 00:00:00", $timestamp);

    $args = [];
    $args['post_title'] = trim($titleNode->plaintext);
    $args['post_date_gmt'] = $args['post_date'] = $postDate;
    $args['post_modified_gmt'] = $args['post_modified'] = $postDate;
    // Casting the element to string yields its markup, which is then rewritten.
    $args['post_content'] = str_replace($search, $replace, (string) $bodyNode);
    $args['post_status'] = 'publish';
    $args['filter'] = true;

    // Yoast SEO meta: prefer the plain meta tags, fall back to Open Graph.
    // Only values that were actually found are stored.
    $meta = [];
    $description = $metaContent($html, ['meta[name=description]', 'meta[property=og:description]']);
    if ($description !== '') {
        $meta['_yoast_wpseo_metadesc'] = $description;
    }
    $seoTitle = $metaContent($html, ['meta[name=title]', 'meta[property=og:title]']);
    if ($seoTitle !== '') {
        $meta['_yoast_wpseo_title'] = $seoTitle;
    }
    if ($meta) {
        $args['meta_input'] = $meta;
    }

    // Everything needed has been copied into plain strings; release the DOM
    // so memory does not grow with every scraped page.
    $html->clear();
    unset($html);

    // Ask for a WP_Error on failure so problems are reported, not swallowed.
    $result = wp_insert_post($args, true);
    if (is_wp_error($result)) {
        echo "\nError: " . $p . " (" . $result->get_error_message() . ")\n";
        continue;
    }
    if (!$result) {
        echo "\nError: " . $p . " (insert failed)\n";
        continue;
    }

    $i++;
}
echo $i . "\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment