phette23/doaj.php

## readme.md

      
    Raw
  

              readme.md
            
          
    These are a few examples of web scraping for a blog post.
Most will require installing a package using the language's package manager. The gem, cpan, and easy_install commands should be readily available for those on Mac or Linux machines. For Windows users, you may be better off searching for instructions online or finding a way to download and import the package manually.
If one of the packages fails to install, chances are you need root privileges. Try re-running the command prefixed with sudo and enter your password at the prompt, e.g. sudo gem install nokogiri.

  
## doaj.php
<?php
// Prints the title of every journal that the DOAJ search returns for a query.
//
// need to download simple_html_dom.php from SourceForge
// http://sourceforge.net/projects/simplehtmldom/files/
// and place it in the same directory as this script
require_once( 'simple_html_dom.php' );
// http://simplehtmldom.sourceforge.net/manual_api.htm

$base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query=';
$query = urlencode( 'librarianship' );

$html = file_get_html( $base . $query );

// file_get_html() gives back a falsy value instead of a DOM object when the
// page cannot be loaded; stop here rather than calling find() on a non-object
if ( !$html ) {
  file_put_contents( 'php://stderr', 'Could not load ' . $base . $query . PHP_EOL );
  exit( 1 );
}

$records = $html->find( '.record .data, .recordColored .data' );

foreach( $records as $record ) {
  // the title is the first <b> inside the record; asking for index 0 yields
  // null when there is no such element, so skip those records
  $title = $record->getElementsByTagName( 'b', 0 );
  if ( $title ) {
    echo $title->plaintext . PHP_EOL;
  }
}

## doaj.pl
# Prints the journal titles that the DOAJ search returns for a query.
#
# requires installing WWW::Mechanize, Mojo::DOM, & WWW::Mechanize::Query
# you can install from the command line with cpan
# cpan WWW::Mechanize Mojo::DOM WWW::Mechanize::Query
use strict;
use warnings;
use WWW::Mechanize::Query;

# titles can contain non-ASCII characters; give STDOUT a UTF-8 layer so they
# are encoded properly instead of silencing "wide character in print" warnings
binmode( STDOUT, ':encoding(UTF-8)' );

my $mech = WWW::Mechanize::Query->new();
my $base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query=';
# NOTE: this term is appended to the URL as-is; URL-escape it if you change it
# to something containing spaces or other reserved characters
my $query = 'librarianship';

$mech->get( $base . $query );

my $titles = $mech->find( '.record .data > b, .recordColored .data > b, .record .data > a > b' );

print( $titles->text() );
print "\n";

## doaj.py
# Prints the journal titles that the DOAJ search returns for a query.
# (Python 2 script: relies on urllib.quote_plus and urllib.urlopen.)
#
# "Beautiful Soup" package: http://www.crummy.com/software/BeautifulSoup/
# this will require installing both BeautifulSoup & lxml
# the command `easy_install BeautifulSoup lxml` should do the trick
from bs4 import BeautifulSoup
import urllib

base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query='
query = urllib.quote_plus( 'librarianship' )

# BeautifulSoup can use different HTML parsers & by default uses Python's
# built-in one. Unfortunately, that parser chokes on the DOAJ page for some
# reason, thus the need for lxml which is specified as the 2nd parameter here.
html = BeautifulSoup( urllib.urlopen( base + query ), 'lxml' )

# match both row classes in one search so results keep their order on the page
# (two separate searches would list every 'record' before any 'recordColored')
records = html.findAll( class_=[ 'record', 'recordColored' ] )

for record in records:
    # find() returns None when nothing matches; skip rows without a title
    data = record.find( class_='data' )
    title = data.find( 'b' ) if data is not None else None
    if title is not None:
        print( title.get_text() )

## doaj.rb
# Prints the journal titles that the DOAJ search returns for a query.
#
# you will need to `gem install nokogiri`
# on the command line first for this to work
require 'nokogiri'
require 'open-uri'
# http://nokogiri.org/tutorials/parsing_an_html_xml_document.html

base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query='
# URI.escape is obsolete and no longer exists on Ruby 3.0+;
# encode_www_form_component is the stdlib way to escape a query-string value
query = URI.encode_www_form_component( 'librarianship' )

# Kernel#open stopped accepting URLs on Ruby 3.0+; the #open method that
# open-uri adds to URI objects works on both old and new Ruby versions
dom = Nokogiri::HTML( URI.parse( base + query ).open )

dom.css( '.record .data > b, .recordColored .data > b, .record .data > a > b' ).each do |title|
  puts title.content
end
	<?php
	// Prints the title of every journal that the DOAJ search returns for a query.
	//
	// need to download simple_html_dom.php from SourceForge
	// http://sourceforge.net/projects/simplehtmldom/files/
	// and place it in the same directory as this script
	require_once( 'simple_html_dom.php' );
	// http://simplehtmldom.sourceforge.net/manual_api.htm

	$base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query=';
	$query = urlencode( 'librarianship' );

	$html = file_get_html( $base . $query );

	// file_get_html() gives back a falsy value instead of a DOM object when the
	// page cannot be loaded; stop here rather than calling find() on a non-object
	if ( !$html ) {
	  file_put_contents( 'php://stderr', 'Could not load ' . $base . $query . PHP_EOL );
	  exit( 1 );
	}

	$records = $html->find( '.record .data, .recordColored .data' );

	foreach( $records as $record ) {
	  // the title is the first <b> inside the record; asking for index 0 yields
	  // null when there is no such element, so skip those records
	  $title = $record->getElementsByTagName( 'b', 0 );
	  if ( $title ) {
	    echo $title->plaintext . PHP_EOL;
	  }
	}
	# Prints the journal titles that the DOAJ search returns for a query.
	#
	# requires installing WWW::Mechanize, Mojo::DOM, & WWW::Mechanize::Query
	# you can install from the command line with cpan
	# cpan WWW::Mechanize Mojo::DOM WWW::Mechanize::Query
	use strict;
	use warnings;
	use WWW::Mechanize::Query;

	# titles can contain non-ASCII characters; give STDOUT a UTF-8 layer so they
	# are encoded properly instead of silencing "wide character in print" warnings
	binmode( STDOUT, ':encoding(UTF-8)' );

	my $mech = WWW::Mechanize::Query->new();
	my $base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query=';
	# NOTE: this term is appended to the URL as-is; URL-escape it if you change it
	# to something containing spaces or other reserved characters
	my $query = 'librarianship';

	$mech->get( $base . $query );

	my $titles = $mech->find( '.record .data > b, .recordColored .data > b, .record .data > a > b' );

	print( $titles->text() );
	print "\n";
	# Prints the journal titles that the DOAJ search returns for a query.
	# (Python 2 script: relies on urllib.quote_plus and urllib.urlopen.)
	#
	# "Beautiful Soup" package: http://www.crummy.com/software/BeautifulSoup/
	# this will require installing both BeautifulSoup & lxml
	# the command `easy_install BeautifulSoup lxml` should do the trick
	from bs4 import BeautifulSoup
	import urllib

	base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query='
	query = urllib.quote_plus( 'librarianship' )

	# BeautifulSoup can use different HTML parsers & by default uses Python's
	# built-in one. Unfortunately, that parser chokes on the DOAJ page for some
	# reason, thus the need for lxml which is specified as the 2nd parameter here.
	html = BeautifulSoup( urllib.urlopen( base + query ), 'lxml' )

	# match both row classes in one search so results keep their order on the page
	# (two separate searches would list every 'record' before any 'recordColored')
	records = html.findAll( class_=[ 'record', 'recordColored' ] )

	for record in records:
	    # find() returns None when nothing matches; skip rows without a title
	    data = record.find( class_='data' )
	    title = data.find( 'b' ) if data is not None else None
	    if title is not None:
	        print( title.get_text() )
	# Prints the journal titles that the DOAJ search returns for a query.
	#
	# you will need to `gem install nokogiri`
	# on the command line first for this to work
	require 'nokogiri'
	require 'open-uri'
	# http://nokogiri.org/tutorials/parsing_an_html_xml_document.html

	base = 'http://www.doaj.org/doaj?func=search&template=&uiLanguage=en&query='
	# URI.escape is obsolete and no longer exists on Ruby 3.0+;
	# encode_www_form_component is the stdlib way to escape a query-string value
	query = URI.encode_www_form_component( 'librarianship' )

	# Kernel#open stopped accepting URLs on Ruby 3.0+; the #open method that
	# open-uri adds to URI objects works on both old and new Ruby versions
	dom = Nokogiri::HTML( URI.parse( base + query ).open )

	dom.css( '.record .data > b, .recordColored .data > b, .record .data > a > b' ).each do |title|
	  puts title.content
	end