Skip to content

Instantly share code, notes, and snippets.

@marktriggs
Created September 21, 2010 23:47
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save marktriggs/590822 to your computer and use it in GitHub Desktop.
Save marktriggs/590822 to your computer and use it in GitHub Desktop.
/* Not the nicest code we ever wrote... */
<?php
require_once 'HTTP/Request.php';
require_once 'Text/Wiki/Mediawiki.php';
/**
 * Looks up a short author biography on the English Wikipedia.
 *
 * The only public entry point is biographyFor(). It takes a catalogue-style
 * author heading ("Surname, Forenames, dates"), fetches the matching article
 * through the MediaWiki API, renders the lead section to XHTML with
 * Text_Wiki_Mediawiki and returns it only when the article looks like it is
 * really about an author (see wikipediaBio() for the acceptance rules).
 *
 * Depends on the PEAR packages HTTP_Request and Text_Wiki_Mediawiki.
 */
class Wikipedia
{
    /**
     * Remove every region of $string enclosed by $start ... $end.
     *
     * Regions may nest; the delimiters themselves are removed as well.
     * An $end marker with no matching $start is dropped without affecting
     * the text that follows it.
     *
     * @param string $string Text to filter.
     * @param string $start  Opening delimiter, e.g. "{{" or "(".
     * @param string $end    Closing delimiter, e.g. "}}" or ")".
     *
     * @return string $string without the delimited regions.
     */
    private function zapBetween ($string, $start, $end)
    {
        $startLen = strlen ($start);
        $endLen = strlen ($end);

        // An empty delimiter would match at every position without ever
        // advancing the cursor, so there is nothing sensible to remove.
        if ($startLen === 0 || $endLen === 0) {
            return $string;
        }

        $nesting = 0;
        $result = "";
        $index = 0;
        $len = strlen ($string);

        while ($index < $len) {
            // Compare only the bytes at the cursor rather than searching the
            // remainder of the string on every iteration.
            if (substr ($string, $index, $startLen) === $start) {
                $nesting++;
                $index += $startLen;
            } else if (substr ($string, $index, $endLen) === $end) {
                // Never let the depth go negative: a stray closing marker
                // must not cause all following text to be discarded.
                if ($nesting > 0) {
                    $nesting--;
                }
                $index += $endLen;
            } else {
                if ($nesting == 0) {
                    $result .= $string[$index];
                }
                $index++;
            }
        }

        return $result;
    }

    /**
     * Find the infobox image named in $wikitext and resolve it to a
     * 150px-wide thumbnail URL using the MediaWiki imageinfo API.
     *
     * @param string $wikitext Raw article wikitext.
     *
     * @return string|false Thumbnail URL, or false when the article names no
     *                      image or the lookup fails.
     */
    private function extractImage ($wikitext)
    {
        $found = array ();
        if (!preg_match ("/\n\s?\|\s?image\s?=\s? (.*?)\n/", $wikitext, $found)) {
            return false;
        }

        $image = trim ($found[1]);
        if ($image === '') {
            return false;
        }

        // The title goes into a query string, so it has to be encoded;
        // spaces become underscores first, as MediaWiki titles expect.
        $imgurl = "http://en.wikipedia.org/w/api.php" .
            '?prop=imageinfo&action=query&iiprop=url&iiurlwidth=150&format=php' .
            '&titles=Image:' . rawurlencode (str_replace (' ', '_', $image));

        $client = new HTTP_Request ();
        $client->setMethod (HTTP_REQUEST_METHOD_GET);
        $client->setURL ($imgurl);
        $response = $client->sendRequest ();
        if (PEAR::isError ($response)) {
            return false;
        }

        $body = $client->getResponseBody ();
        if (!$body) {
            return false;
        }

        // NOTE(review): unserialize() on data fetched from a remote host is
        // risky; switching both API calls to format=json and json_decode()
        // would be safer. Here it only validates that the body is a
        // well-formed serialized PHP value.
        if (!unserialize ($body)) {
            return false;
        }

        // Hack for wikipedia api: rather than walking the nested result,
        // take the first quoted URL in the payload.
        $matches = array ();
        if (preg_match ('/"(https?:\/\/[^"]*)"/', $body, $matches)) {
            return $matches[1];
        }

        return false;
    }

    /**
     * Strip templates, image links and (normally) everything from the first
     * section heading onwards, leaving just the lead of the article.
     *
     * @param string $wikitext Raw article wikitext.
     * @param string $title    Article title. When it contains
     *                         "disambiguation" the sections are kept.
     *
     * @return string Cleaned wikitext.
     */
    private function cleanWikiText ($wikitext, $title = '')
    {
        $wikitext = $this->zapBetween ($wikitext, "{{", "}}");
        $wikitext = preg_replace ("/\[\[Image:.*?\]\]\n/", "", $wikitext);

        if (!preg_match ('/disambiguation/', (string) $title)) {
            // Drop everything from the first "==" heading to the end.
            $wikitext = preg_replace ('/\=\=.*?$/s', '', $wikitext);
        }

        return $wikitext;
    }

    /**
     * Fetch one Wikipedia article and turn it into a biography record.
     *
     * The article is accepted only when its title contains "disambiguation",
     * or its first sentence contains an author-ish word (author, writer,
     * journalist, novelist), or the text mentions the four-digit year found
     * in the last element of $author.
     *
     * @param string $lookfor URL-encoded article title to request.
     * @param array  $author  Comma-separated parts of the author heading; the
     *                        last part is searched for a year.
     *
     * @return array|false Array with the keys name, description, provider,
     *                     provider_link and optionally image; false when the
     *                     page is missing, the request fails or the article
     *                     is not accepted.
     */
    private function wikipediaBio ($lookfor, $author)
    {
        $wiki = new Text_Wiki_Mediawiki ();
        $wiki->setRenderConf ('xhtml', 'wikilink', 'pages', false);
        $wiki->setFormatConf ('Xhtml', 'charset', 'utf-8');
        $wiki->setFormatConf ('Xhtml', 'translate', false);
        $wiki->setRenderConf ('xhtml', 'image', 'base',
                              'http://en.wikipedia.org/wiki/Image:');
        $wiki->setRenderConf ('xhtml', 'wikilink', 'view_url',
                              'http://en.wikipedia.org/wiki/%s');
        $wiki->disableRule ('image');

        $url = 'http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=php&redirects=1&titles=' . $lookfor;
        $requestParams = array ('timeout' => 2, 'readTimeout' => array ('4','0'));
        $client = new HTTP_Request ('', $requestParams);
        $client->setMethod (HTTP_REQUEST_METHOD_GET);
        $client->setURL ($url);

        // Have PEAR hand errors back as return values.
        PEAR::setErrorHandling (PEAR_ERROR_RETURN);
        $response = $client->sendRequest ();
        if (PEAR::isError ($response)) {
            return false;
        }

        // NOTE(review): unserialize() on remote data - see extractImage().
        $body = @unserialize ($client->getResponseBody ());

        // Check if data exists or not
        if (!$body ||
            isset ($body['error']) ||
            !isset ($body['query']['pages']) ||
            !is_array ($body['query']['pages']) ||
            isset ($body['query']['pages']['-1'])) {
            return false;
        }

        $page = array_shift ($body['query']['pages']);
        if (!isset ($page['title']) ||
            !isset ($page['revisions']) ||
            !is_array ($page['revisions'])) {
            return false;
        }

        $revision = array_shift ($page['revisions']);
        if (!isset ($revision['*'])) {
            return false;
        }

        $result = array ();
        $result['name'] = $page['title'];
        $wikitext = $revision['*'];

        $image_url = $this->extractImage ($wikitext);
        if ($image_url) {
            $result['image'] = $image_url;
        }

        $wikitext = $this->cleanWikiText ($wikitext, $result['name']);
        $result['description'] = $wiki->transform ($wikitext, 'xhtml');

        /* If we have life dates, extract them now... */
        $year = false;
        $matches = array ();
        $lastPart = is_array ($author) ? (string) array_pop ($author) : '';
        if (preg_match ("/[0-9]{4}/", $lastPart, $matches)) {
            $year = $matches[0];
        }

        $result['provider'] = "Wikipedia";
        $result['provider_link'] = "http://www.wikipedia.org/wiki/" . $result['name'];

        if (preg_match ('/disambiguation/', $result['name']) ||
            preg_match ('/^[^\.]+(author|writer|journalist|novelist)[^\.]*\./',
                        $wikitext) ||
            ($year && preg_match ("/$year/", $wikitext))) {
            return $result;
        }

        return false;
    }

    /**
     * Turn an author heading into a Wikipedia title and look it up.
     *
     * A trailing comma and any parenthetical remarks are removed, then the
     * heading is split on commas and requested as "Forenames Surname". When
     * a "(disambiguation)" page exists for that name the lookup gives up,
     * because there is no way to tell which person is meant.
     *
     * @param string $name Author heading, e.g. "Knuth, Donald Ervin, 1938-".
     *
     * @return array|false Biography record from wikipediaBio(), or false.
     */
    private function findBiography ($name)
    {
        $author = preg_replace ("/,$/", "", $name);

        /* Remove any parenthetical remarks from the author's name */
        $author = $this->zapBetween ($author, "(", ")");
        $author = explode (',', $author);

        // Headings without a comma have no forename part.
        $surname = trim ($author[0]);
        $forenames = isset ($author[1]) ? trim ($author[1]) : '';
        $fullName = trim ("$forenames $surname");
        if ($fullName === '') {
            return false;
        }

        // Look for a disambiguation page
        $wikiInfo = $this->wikipediaBio (urlencode ("$fullName (disambiguation)"),
                                         $author);
        if ($wikiInfo) {
            return false;
        }

        // Look for an author page
        $wikiInfo = $this->wikipediaBio (urlencode ($fullName), $author);
        if ($wikiInfo) {
            return $wikiInfo;
        }

        // Give up!
        return false;
    }

    /**
     * Look up a biography for an author heading.
     *
     * @param string $name Author heading, e.g. "Franklin, Miles".
     *
     * @return array|false The record from findBiography() plus two keys:
     *                     snippet (the description up to and including its
     *                     first closing paragraph tag) and hasmore (true
     *                     when anything but whitespace follows the snippet).
     *                     False when no suitable article is found.
     */
    function biographyFor ($name)
    {
        $result = $this->findBiography ($name);
        if ($result) {
            $result['snippet'] = preg_replace ("/<\/p>.*/s", "</p>",
                                               $result['description']);

            // substr() yields false or "" when nothing follows the snippet,
            // depending on the PHP version; treat both as "no more text".
            $rest = (string) substr ($result['description'],
                                     strlen ($result['snippet']));
            $result['hasmore'] = (trim ($rest) !== '');
        }
        return $result;
    }
}
?>
@marktriggs
Copy link
Author

Using it...

php> require 'Wikipedia.php'
php> $wikipedia = new Wikipedia();

This works because Miles Franklin has the word "writer" in the first sentence.

php> = $wikipedia->biographyFor ("Franklin, Miles");
Array
(
[name] => Miles Franklin
[image] => http://upload.wikimedia.org/wikipedia/commons/thumb/4/4c/Miles_franklin.jpg/150px-Miles_franklin.jpg
[description] => <p><strong>Miles Franklin</strong> (born "Stella Maria Sarah Miles Franklin"; 14 October 1879 – 19 September 1954) was an Australian writer and feminist who is best known for her autobiographical novel, <em><a class="" href="http://en.wikipedia.org/wiki/My_Brilliant_Career#">My Brilliant Career</a></em>, published in 1901. While she wrote throughout her life, her other major literary success, <em>All That Swagger</em>, was not published until 1936.</p>

<p>She was committed to the development of a uniquely Australian form of literature, and she actively pursued this goal by supporting writers, literary journals, and writers' organisations. She has had a long-lasting impact on Australian literary life through her endowment of a major literary award known as the <a class="" href="http://en.wikipedia.org/wiki/Miles_Franklin_Award#">Miles Franklin Award</a>.</p>


[provider] => Wikipedia
[provider_link] => http://www.wikipedia.org/wiki/Miles Franklin
[snippet] => <p><strong>Miles Franklin</strong> (born "Stella Maria Sarah Miles Franklin"; 14 October 1879 – 19 September 1954) was an Australian writer and feminist who is best known for her autobiographical novel, <em><a class="" href="http://en.wikipedia.org/wiki/My_Brilliant_Career#">My Brilliant Career</a></em>, published in 1901. While she wrote throughout her life, her other major literary success, <em>All That Swagger</em>, was not published until 1936.</p>
[hasmore] => 1
)

Works because we match on "1938"
php> = $wikipedia->biographyFor ("Knuth, Donald Ervin, 1938-");
Array
(
[name] => Donald Knuth
[image] => http://upload.wikimedia.org/wikipedia/commons/thumb/4/4f/KnuthAtOpenContentAlliance.jpg/150px-KnuthAtOpenContentAlliance.jpg
[description] =>

Donald Ervin Knuth ("Frequently Asked Questions" at Stanford site. Gives the pronunciation of his name as "Ka-NOOTH".) (born January 10, 1938) is a computer scientist and Professor Emeritus of the Art of Computer Programming at Stanford University.Donald Knuth's Homepage at Stanford.

<p>Author of the seminal multi-volume work <em><a class="" href="http://en.wikipedia.org/wiki/The_Art_of_Computer_Programming#">The Art of Computer Programming</a></em> ("TAOCP"),<ref><a href="http://www-cs-faculty.stanford.edu/~uno/taocp.html" onclick="window.open(this.href, '_blank'); return false;">The Art of Computer Programming</a> (Stanford University).</ref> Knuth has been called the "father" of the <a class="" href="http://en.wikipedia.org/wiki/analysis_of_algorithms#">analysis of algorithms</a>, contributing to the development of, and systematizing formal mathematical techniques for, the rigorous analysis of the computational complexity of algorithms, and in the process popularizing <a class="" href="http://en.wikipedia.org/wiki/Big_O_notation#">asymptotic notation</a>.</p>

<p>In addition to fundamental contributions in several branches of <a class="" href="http://en.wikipedia.org/wiki/theoretical_computer_science#">theoretical computer science</a>, Knuth is the creator of the <a class="" href="http://en.wikipedia.org/wiki/TeX#">TeX</a> computer typesetting system, the related <a class="" href="http://en.wikipedia.org/wiki/METAFONT#">METAFONT</a> font definition language and rendering system, and the <a class="" href="http://en.wikipedia.org/wiki/Computer_Modern#">Computer Modern</a> family of typefaces.</p>

<p>A writer and scholar,<ref><a href="http://www-cs-faculty.stanford.edu/~knuth/vita.html" onclick="window.open(this.href, '_blank'); return false;">Knuth's CV</a></ref> Knuth created the <a class="" href="http://en.wikipedia.org/wiki/WEB#">WEB</a>/<a class="" href="http://en.wikipedia.org/wiki/CWEB#">CWEB</a> computer programming systems designed to encourage and facilitate <a class="" href="http://en.wikipedia.org/wiki/literate_programming#">literate programming</a>, and designed the <a class="" href="http://en.wikipedia.org/wiki/MMIX#">MMIX</a> <a class="" href="http://en.wikipedia.org/wiki/instruction_set_architecture#">instruction set architecture</a>.</p>


[provider] => Wikipedia
[provider_link] => http://www.wikipedia.org/wiki/Donald Knuth
[snippet] => <p><strong>Donald Ervin Knuth</strong> (<ref name="FAQ"><a href="http://www-cs-faculty.stanford.edu/~knuth/faq.html" onclick="window.open(this.href, '_blank'); return false;">&quot;Frequently Asked Questions&quot; at Stanford site</a>. Gives the pronunciation of his name as "Ka-NOOTH".</ref>) (born January 10, 1938) is a <a class="" href="http://en.wikipedia.org/wiki/computer_science#">computer scientist</a> and <a class="" href="http://en.wikipedia.org/wiki/Emeritus#">Professor Emeritus</a> of the Art of Computer Programming at <a class="" href="http://en.wikipedia.org/wiki/Stanford_University#">Stanford University</a>.<ref><a href="http://www-cs-faculty.stanford.edu/~knuth/" onclick="window.open(this.href, '_blank'); return false;">Donald Knuth's Homepage at Stanford</a>.</ref></p>
[hasmore] => 1
)

Works because of the word "author" in the first sentence.
php> = $wikipedia->biographyFor ("Thiele, Colin");
Array
(
[name] => Colin Thiele
[description] =>

Colin Milton Thiele, AC (16 November 1920 – 5 September 2006) was an Australian author and educator. He was renowned for his award-winning children's fiction, most notably the novels Storm Boy and Blue Fin.

[provider] => Wikipedia
[provider_link] => http://www.wikipedia.org/wiki/Colin Thiele
[snippet] => <p><strong>Colin Milton Thiele</strong>, <a class="" href="http://en.wikipedia.org/wiki/Order_of_Australia#">AC</a> (16 November 1920 – 5 September 2006) was an Australian author and educator. He was renowned for his award-winning children's fiction, most notably the novels <em><a class="" href="http://en.wikipedia.org/wiki/Storm_Boy_%28novel%29#">Storm Boy</a></em> and <em><a class="" href="http://en.wikipedia.org/wiki/Blue_Fin#">Blue Fin</a></em>.</p>
[hasmore] =>
)

Gives no match--no life dates or author-ish words...
php> = $wikipedia->biographyFor ("Hitchcock, Alfred");
php>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment