Skip to content

Instantly share code, notes, and snippets.

@cimi
Created April 11, 2013 14:02
Show Gist options
  • Save cimi/5363621 to your computer and use it in GitHub Desktop.
Save cimi/5363621 to your computer and use it in GitHub Desktop.
Automatically scrape a website and create WordPress posts from the machines listed on it.
<pre>
<?php
// Bootstrap. Both constants are defined before admin.php is pulled in.
// NOTE(review): WP_IMPORT_ADMIN is presumably read by the WordPress admin
// bootstrap -- confirm against the WordPress version in use.
define('WP_IMPORT_ADMIN', true);
// Taxonomy that addMachine() assigns each machine's category to.
define('TAXONOMY', 'portfolio_category');
// admin.php lives in the parent of the directory containing this script.
require_once dirname(__DIR__) . '/admin.php';
/**
 * Downloads an image and attaches it to a post as its thumbnail.
 *
 * The image is saved under ../../wp-content/uploads/thumbnails/ using a
 * lower-cased, dash-separated slug derived from $name. The file is always
 * given a ".jpg" extension regardless of what the remote URL serves.
 *
 * @param string $url     Address of the image to download.
 * @param string $name    Human-readable name used to build the file name.
 * @param int    $post_id ID of the post that receives the thumbnail.
 * @return void
 */
function addThumbnail($url, $name, $post_id) {
    $slug = strtolower(preg_replace('/(\s\s+)|(\s*[^a-zA-Z0-9]+\s*)/', '-', $name));
    $filename = "../../wp-content/uploads/thumbnails/" . $slug . ".jpg";

    // Bail out instead of writing an empty "image" when the download fails.
    $image_data = file_get_contents($url);
    if ($image_data === false) {
        echo "Could not download thumbnail for $name from $url\n";
        return;
    }
    if (file_put_contents($filename, $image_data) === false) {
        echo "Could not write thumbnail for $name to $filename\n";
        return;
    }

    $wp_filetype = wp_check_filetype($filename, null);
    $base_name = preg_replace('/\.[^.]+$/', '', basename($filename));
    $attachment = array(
        'post_mime_type' => $wp_filetype['type'],
        'post_title' => $base_name,
        'post_name' => $base_name,
        'post_content' => '',
        'post_parent' => $post_id,
        'post_status' => 'inherit'
    );

    $attachment_id = wp_insert_attachment($attachment, $filename, $post_id);
    if ($attachment_id != 0) {
        // Store the metadata that was just generated; the previous code passed
        // an undefined variable here, so the metadata was never saved.
        $attachment_data = wp_generate_attachment_metadata($attachment_id, $filename);
        wp_update_attachment_metadata($attachment_id, $attachment_data);
        // Mark the new attachment as the post's featured image.
        update_post_meta($post_id, '_thumbnail_id', $attachment_id);
    }
}
/**
 * Imports one scraped machine as a published 'portfolio' post.
 *
 * If a portfolio post with the same title already exists the machine is
 * skipped. Otherwise a post is created, its category is assigned in the
 * TAXONOMY taxonomy, and its image is attached as the thumbnail.
 *
 * @param object $machine Decoded JSON record; this function reads its
 *                        name, description, category and image properties.
 * @return void
 */
function addMachine($machine) {
    $existing = get_page_by_title($machine->name, OBJECT, 'portfolio');
    if ($existing && $existing->ID > 0) {
        echo "Machine $machine->name already imported.\n";
        return;
    }

    $post_id = wp_insert_post(array(
        'post_title' => $machine->name,
        'post_content' => $machine->description,
        'post_status' => 'publish',
        'post_type' => 'portfolio'
    ));

    // wp_insert_post() signals failure with 0 (or a WP_Error); do not attach
    // terms or a thumbnail to a post that was never created.
    if (!$post_id || is_wp_error($post_id)) {
        echo "Could not create a post for machine $machine->name.\n";
        return;
    }

    // NOTE(review): the category is passed by name; if TAXONOMY is a
    // hierarchical taxonomy WordPress expects term IDs here -- confirm.
    wp_set_post_terms($post_id, $machine->category, TAXONOMY);
    addThumbnail($machine->image, $machine->name, $post_id);
}
// Read the scraped machine list (a JSON document stored in machine_list.txt)
// and import every entry.
$machine_json = file_get_contents('machine_list.txt');
$machines = ($machine_json === false) ? null : json_decode($machine_json);

// json_decode() returns null for invalid JSON; report that instead of
// handing a non-iterable value to foreach.
if (!is_array($machines) && !is_object($machines)) {
    echo "Could not read a machine list from machine_list.txt\n";
} else {
    foreach ($machines as $machine) {
        echo "Processing $machine->name with thumbnail $machine->image\n";
        addMachine($machine);
    }
}
?>
</pre>
#!/usr/bin/env ruby
# Scrapes the product pages under PREFIX and writes every machine found
# (name, category, image URL, description, page URL) to machine_list.txt
# as a JSON array.
require 'json'
require 'open-uri'
# The library file is lower-case; 'Nokogiri' only loads on case-insensitive
# file systems.
require 'nokogiri'

PREFIX = 'http://www.biesseamerica.com/'

# Downloads +url+ and returns it parsed as an HTML document.
def fetch_html(url)
  # Kernel#open no longer accepts URLs on Ruby 3.0+; URI.open is the public
  # open-uri entry point on newer Rubies. Fall back to Kernel#open where
  # URI.open is not publicly available.
  if URI.respond_to?(:open)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  else
    open(url) { |io| Nokogiri::HTML(io) }
  end
end

# Step 1: map each category link on the product index page to its label.
index_doc = fetch_html(PREFIX + "default.asp?biesse=178&urlkeyword=products/")
categories = {}
index_doc.css('#content #column7 ul li h4 a').each do |node|
  next if node['href'].nil?
  categories[node['href']] = node.content
end

# Step 2: collect the machine page links listed under every category.
pages_by_category = {}
categories.each do |link, name|
  print "Opening: " + PREFIX + link + "\n"
  fetch_html(PREFIX + link).css('.product_list a').each do |node|
    next if node['href'].nil?
    (pages_by_category[name] ||= []) << node['href']
  end
end

# Step 3: visit every machine page and extract its details.
machines = []
pages_by_category.each do |category, pages|
  pages.each do |link|
    url = PREFIX + link
    print "Opening machine page: " + url + "\n"
    machine_doc = fetch_html(url)
    name = machine_doc.css('.header h1').first
    image = machine_doc.css('.product_image img').first

    # A page without a heading or product image cannot be imported; skip it
    # rather than crashing on nil.
    if name.nil? || image.nil? || image['src'].nil?
      print "Skipping " + url + ": missing name or image\n"
      next
    end

    paragraphs = machine_doc.css('#content #column2 .overview p')
    description = paragraphs.map(&:content).join("\n").rstrip

    machines << {
      "name" => name.content,
      "category" => category,
      "image" => PREFIX + image['src'],
      "description" => description,
      "url" => url
    }
  end
end

File.write('machine_list.txt', JSON.dump(machines))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment