Last active
July 8, 2021 15:47
-
-
Save joshlewis/f22350235a7619e9d70b to your computer and use it in GitHub Desktop.
What kind of clickbait numbers is BuzzFeed using in their titles? Let's see some stats, based on their RSS feeds.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Which clickbait-style numbers does BuzzFeed put in its titles these days?

// Section slugs of the BuzzFeed RSS feeds to sample, in the order they are fetched.
// Currently disabled: celeb, espanol, france, and the possibly obsolete
// ideas, usnews, uknews, aunews and community feeds.
$feeds = [
    'animals', 'tvandmovies', 'food', 'fwd',
    'lgbt', 'music', 'politics', 'rewind',
    'sports', 'lol', 'win', 'omg',
    'cute', 'geeky', 'trashy', 'fail',
    'wtf', 'australia', 'books', 'brasil',
    'comics', 'diy', 'longform', 'tech',
    'travel', 'quiz', 'world', 'health',
];
// Every item title found across all feeds, stored as plain strings.
$titles = [];

// Download each feed and collect its item titles.
// libxml errors are collected internally while the loop runs so that a
// malformed feed is reported through our own message instead of a burst of
// parser warnings; the previous setting is restored afterwards.
$previousLibxmlSetting = libxml_use_internal_errors(true);
foreach ($feeds as $feed) {
    echo "\rGetting $feed feed... ";
    $feedUrl = 'http://www.buzzfeed.com/'.$feed.'.xml';

    // "@" is deliberate: a failed download is detected through the return
    // value and reported below, so the raw warning would only be noise.
    $rawXmlString = @file_get_contents($feedUrl);
    if ($rawXmlString === false) {
        echo "\nSomething is wrong with this feed. Look into it: ".$feedUrl."\n";
        continue;
    }

    // A feed can download fine and still not be parseable XML (for example an
    // HTML error page); say so instead of silently skipping it.
    $contentObj = simplexml_load_string($rawXmlString);
    if ($contentObj === false) {
        libxml_clear_errors();
        echo "\nThis feed did not return valid XML. Look into it: ".$feedUrl."\n";
        continue;
    }

    // xpath() does not always return an array (it can yield false on error),
    // so only iterate over a real result list.
    $result = $contentObj->xpath('/rss/channel/item/title');
    if (!is_array($result)) {
        continue;
    }

    // Keep only the title text; holding on to SimpleXMLElement nodes would
    // keep every parsed feed document alive for the rest of the run.
    foreach ($result as $titleNode) {
        $titles[] = (string) $titleNode;
    }
}
libxml_use_internal_errors($previousLibxmlSetting);
// Sort every title into "opens with a clickbait number" or "does not".
// All three result lists are created up front so they exist (as empty arrays)
// even when no title lands in them; otherwise the statistics and report code
// further down would be handed undefined variables.
$titlesWithNumbers = [];
$titlesWithoutNumbers = [];
$clickbaitNumbers = [];

foreach ($titles as $title) {
    // Clickbait numbers are almost always at the very beginning of the title,
    // so that is where we look. A small set of leading words may precede the
    // number: Top/The/Our (English), Les (French), Os/As (Portuguese) and
    // Los (Spanish). The number itself is 1-3 digits followed by whitespace.
    if (preg_match('/^(Top|The|Our|Les|Os|As|Los)?\s*(\d{1,3})\s+/', $title, $matches) === 1) {
        $titlesWithNumbers[] = $title;
        // Store the number as an integer so that e.g. "07" and "7" are
        // tallied as the same value.
        $clickbaitNumbers[] = (int) $matches[2];
    } else {
        $titlesWithoutNumbers[] = $title;
    }
}
// Compute the mode, rounded mean and median of the clickbait numbers.
// When no clickbait numbers were collected (the list is empty or was never
// created), all three results are set to the placeholder 'n/a' instead of
// failing on max() of an empty array or dividing by zero.
if (!isset($clickbaitNumbers) || !is_array($clickbaitNumbers)) {
    $clickbaitNumbers = [];
}
$count = count($clickbaitNumbers); // how many numbers we have

if ($count === 0) {
    $mode = 'n/a';
    $roundedMean = 'n/a';
    $median = 'n/a';
} else {
    // Mode: the value with the highest occurrence count. On a tie, the value
    // that appears first in the tally wins.
    $modeValues = array_count_values($clickbaitNumbers);
    $mode = array_search(max($modeValues), $modeValues, true);

    // Mean, rounded to a whole number for display.
    $roundedMean = number_format(array_sum($clickbaitNumbers) / $count);

    // Median logic adapted from
    // http://www.mdj.us/web-development/php-programming/calculating-the-median-average-values-of-an-array-with-php/
    sort($clickbaitNumbers, SORT_NUMERIC);
    // Index of the middle element, or of the lower of the two middle
    // elements; cast to int because floor() returns a float.
    $middleval = (int) floor(($count - 1) / 2);
    if ($count % 2) {
        // Odd number of values: the middle one is the median.
        $median = $clickbaitNumbers[$middleval];
    } else {
        // Even number of values: average the two middle ones.
        $low = $clickbaitNumbers[$middleval];
        $high = $clickbaitNumbers[$middleval + 1];
        $median = ($low + $high) / 2;
    }
}
// Print the summary and a random sample from each category.
// Either list may never have been created if no title fell into it, so treat
// a missing list as empty rather than passing an undefined variable to
// count() and shuffle().
if (!isset($titlesWithNumbers) || !is_array($titlesWithNumbers)) {
    $titlesWithNumbers = [];
}
if (!isset($titlesWithoutNumbers) || !is_array($titlesWithoutNumbers)) {
    $titlesWithoutNumbers = [];
}

$totalArticleCount = count($titlesWithNumbers) + count($titlesWithoutNumbers);

// Share of titles with a clickbait number, as a percentage; reported as 0
// when no articles were found at all, to avoid a division by zero.
$clickbaitPercentage = 0;
if ($totalArticleCount > 0) {
    $clickbaitPercentage = (count($titlesWithNumbers) / $totalArticleCount) * 100;
}

echo "\rAbout ".number_format($clickbaitPercentage)."% of the ".$totalArticleCount." BuzzFeed articles we found use clickbait-style numbers.\n\n";
echo "The number $mode occurred the most often.\n";
echo "$roundedMean was the average of the integers used.\n";
echo "$median was the median of all the clickbait numbers.\n";

// Shuffle both lists so the first $sampleCount entries form a random sample;
// array_slice() simply returns fewer entries when a list is shorter.
$sampleCount = 10;
shuffle($titlesWithNumbers);
shuffle($titlesWithoutNumbers);
echo "\nA random sampling of $sampleCount titles with clickbait numbers:\n=====\n".implode("\n", array_slice($titlesWithNumbers, 0, $sampleCount))."\n\n";
echo "A random sampling of $sampleCount titles without clickbait numbers:\n=====\n".implode("\n", array_slice($titlesWithoutNumbers, 0, $sampleCount))."\n\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
As of today, this is still working! 👍