Skip to content

Instantly share code, notes, and snippets.

@kasparsd
Created August 29, 2013 20:15
Show Gist options
  • Save kasparsd/6382878 to your computer and use it in GitHub Desktop.
Save kasparsd/6382878 to your computer and use it in GitHub Desktop.
Parse HTML with the QueryPath library and create a JSON feed of all blog posts
<?php
require 'querypath/src/qp.php';
// Replacement table applied to comment date strings before they are handed
// to strtotime(). str_replace() works through the pairs in array order, so
// 'kl.' has to stay ahead of the bare '.' entry or it would never match.
$map = array(
	'kl.' => ' ',
	'.'   => ' ',
	'des' => 'dec',
	'mai' => 'may',
	'okt' => 'oct',
);

// Collected post records; serialized to JSON once every file has been processed.
$posts = array();
// Split the replacement table once instead of once per comment. str_replace()
// applies the pairs in array order, so the order defined in $map is preserved.
$date_search  = array_keys( $map );
$date_replace = array_values( $map );

// glob() returns false on error; treat that as "no files" so the loop is
// skipped cleanly instead of iterating over a boolean.
$files = glob( 'posts/*.html' );
if ( false === $files ) {
	$files = array();
}

foreach ( $files as $file ) {
	// The part of the file name before the first "_" is used as the post's
	// Unix timestamp (it is passed to date() below).
	$date = current( explode( '_', basename( $file ) ) );

	// Reject files without a usable numeric prefix before spending time reading them.
	if ( ! is_numeric( $date ) || $date < 100000 ) {
		continue;
	}

	$html = file_get_contents( $file );
	if ( false === $html ) {
		// Unreadable file: PHP has already raised a warning. Skip it rather
		// than handing `false` to the HTML parser and exporting an empty post.
		continue;
	}

	// Tag names: text of every tag link in the post meta section.
	$tags = array();
	foreach ( htmlqp( $html, '#main .meta .tags a' ) as $tag ) {
		$tags[] = $tag->text();
	}

	// One record per comment element.
	$comments = array();
	foreach ( htmlqp( $html, '#comments .comment' ) as $comment ) {
		// Normalize the date text via $map, then parse it. strtotime() returns
		// false for text it cannot parse; export null in that case instead of
		// letting date() turn false into a bogus 1970-01-01 date.
		$raw_date  = str_replace( $date_search, $date_replace, $comment->find( '.meta p' )->text() );
		$timestamp = strtotime( $raw_date );

		$comments[] = array(
			'comment_author'     => strip_tags( $comment->find( '.meta h4' )->text() ),
			'comment_author_url' => $comment->find( '.meta h4 a' )->attr( 'href' ),
			'comment_content'    => $comment->find( '.content' )->text(),
			'comment_date'       => ( false === $timestamp ) ? null : date( 'r', $timestamp ),
		);
	}

	$post = array(
		'post_title'   => htmlqp( $html, '.entry h2' )->text(),
		'post_content' => htmlqp( $html, '.content' )->children()->html(),
		// date() expects an integer timestamp; $date is a numeric string here.
		'post_date'    => date( 'r', (int) $date ),
		'categories'   => htmlqp( $html, '#main .meta:eq(3) li:first a' )->text(),
		'tags'         => $tags,
		'comments'     => $comments,
	);

	$posts[] = $post;

	// Disabled alternative: builds an RSS <item> with wp:* elements for the
	// export.xml writer that is commented out at the end of this file.
	/*
	$item = array(
	sprintf( '<title>%s</title>', $post['post_title'] ),
	sprintf( '<pubDate>%s</pubDate>', $post['post_date'] ),
	sprintf( '<content:encoded><![CDATA[%s]]></content:encoded>', $post['post_content'] ),
	sprintf( '<category domain="category"><![CDATA[%s]]></category>', $post['categories'] ),
	'<wp:post_type>post</wp:post_type>'
	);
	foreach ( $post['tags'] as $tag )
	$item[] = sprintf( '<category domain="post_tag"><![CDATA[%s]]></category>', $tag );
	foreach ( $post['comments'] as $comment )
	$item[] = sprintf(
	'<wp:comment>
	<wp:comment_author><![CDATA[%s]]></wp:comment_author>
	<wp:comment_author_url>%s</wp:comment_author_url>
	<wp:comment_date>%s</wp:comment_date>
	<wp:comment_content><![CDATA[%s]]></wp:comment_content>
	</wp:comment>',
	$comment['comment_author'],
	$comment['comment_author_url'],
	$comment['comment_date'],
	$comment['comment_content']
	);
	$items[] = sprintf( '<item>%s</item>', implode( "\n", $item ) );
	*/
}
// Serialize all collected posts and write them to export.json.
// json_encode() returns false on failure (for example when the scraped text
// contains invalid UTF-8); without this check an empty file would be written
// and the script would still report success.
$json = json_encode( $posts );
if ( false === $json ) {
	echo 'ERROR: could not encode posts as JSON (json_last_error code ' . json_last_error() . ')';
	exit( 1 );
}

// file_put_contents() returns false when the file cannot be written.
if ( false === file_put_contents( 'export.json', $json ) ) {
	echo 'ERROR: could not write export.json';
	exit( 1 );
}

echo 'DONE';
/*
// RSS export
file_put_contents(
'export.xml',
sprintf(
'<?xml version="1.0" encoding="UTF-8"?>
<rss>
<wp:wxr_version>1.2</wp:wxr_version>
<channel>%s</channel>
</rss>',
implode( "\n", $items )
)
);
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment