Skip to content

Instantly share code, notes, and snippets.

@aaronpk
Created April 2, 2011 20:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aaronpk/899827 to your computer and use it in GitHub Desktop.
Save aaronpk/899827 to your computer and use it in GitHub Desktop.
Retrieve the first significant sentence from a Wikipedia article
<?php
// Raw "id" query-string parameter as supplied by the client (untrusted input).
// Guarded with isset() so a request without ?id=... yields NULL instead of
// raising an undefined-index notice.
// NOTE(review): presumably intended to be passed to getSignificantSentence() - confirm against the caller.
$id = isset($_GET['id']) ? $_GET['id'] : NULL;
/**
 * Fetch a Wikipedia article by page ID and return its first "significant" sentence.
 *
 * The article HTML is downloaded with cURL, the region between the
 * "bodytext" comment markers is extracted, tables and thumbnail boxes are
 * removed, remaining tags are stripped, and the text is split into sentences
 * on periods. The first non-empty sentence that does NOT have the shape
 * "<x> is <y> in <z>" is returned.
 *
 * @param int|string $id Page ID used for the `curid` query parameter. Must be
 *                       an integer or a string consisting only of decimal digits.
 * @return string|false  The sentence (trimmed, without its terminating period),
 *                       or FALSE when the ID is invalid, the download fails,
 *                       the body markers are missing, a regex operation
 *                       errors out, or no sentence qualifies.
 */
function getSignificantSentence($id) {
    // The ID is concatenated into the URL, so accept digits only. This keeps
    // untrusted input (e.g. "12&action=raw") from injecting extra query
    // parameters and avoids a pointless request for empty/garbage IDs.
    if ((!is_int($id) && !is_string($id)) || !preg_match('/\A\d+\z/', (string) $id)) {
        return FALSE;
    }

    $url = 'http://en.wikipedia.org/w/index.php?curid=' . $id;

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Geoloqi (Geo-coded Wikipedia Article Layer) http://geoloqi.com');
    $html = curl_exec($ch);

    // curl_exec() yields FALSE on transport failure; there is nothing to parse then.
    if (!is_string($html) || $html === '') {
        return FALSE;
    }

    // Article text is between "bodytext" comment tags
    if (!preg_match('/<!-- bodytext -->(.+)<!-- \/bodytext -->/s', $html, $match)) {
        return FALSE;
    }

    // Remove tables (they don't contain explanatory text), then thumbnail
    // pictures. Patterns in the array are applied in order.
    $article = preg_replace(
        array(
            '#<table[^>]*>.+?</table>#is',
            '#<div class="thumb[^>]+>.+?</div>\s*</div>\s*</div>#s',
        ),
        '',
        $match[1]
    );
    // preg_replace() returns NULL on a PCRE error (e.g. backtrack limit hit).
    if ($article === NULL) {
        return FALSE;
    }

    // Strip all remaining HTML tags to get plain text
    $article = strip_tags($article);

    // Split the text into sentences on periods, except periods that directly
    // follow the abbreviations mr/ms/dr/mt/st (case-insensitive).
    $sentences = preg_split('/(?<!(mr|ms|dr|mt|st))\.\s*/i', $article);
    if ($sentences === FALSE) {
        return FALSE;
    }

    // Look for the first sentence not in the format (* is * in *)
    foreach ($sentences as $s) {
        $s = trim($s);
        // Splitting leaves empty fragments (leading/trailing/consecutive
        // periods); an empty string is never a meaningful sentence.
        if ($s === '') {
            continue;
        }
        if (!preg_match('/.+ is .+ in .+/', $s)) {
            return $s;
        }
    }

    return FALSE;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment