Skip to content

Instantly share code, notes, and snippets.

@timdp
Last active December 11, 2015 06:59
Show Gist options
  • Save timdp/4563560 to your computer and use it in GitHub Desktop.
Save timdp/4563560 to your computer and use it in GitHub Desktop.
Twitter Archive Analyzer
<?php
// Twitter Archive Analyzer
// @tmdpw, 2013-01-18
// https://github.com/timdp
// Instructions:
// 1. Request your Twitter archive from twitter.com.
// 2. Extract the zip file you obtained.
// 3. Run this script with the extraction path as its argument.
error_reporting(E_ALL ^ E_NOTICE);

// Exactly one argument is expected: the directory containing the YYYY_MM.js
// files. Report misuse on stderr with a non-zero exit status.
if (count($argv) != 2) {
    fwrite(STDERR, "Usage: php $argv[0] PATH" . PHP_EOL);
    exit(1);
}

$base_dir = $argv[1];
if (!is_dir($base_dir)) {
    fwrite(STDERR, "Not a directory: $base_dir" . PHP_EOL);
    exit(1);
}

// Frequency tables: lowercased item => number of occurrences.
$mentions_freq = array();
$hashtag_freq = array();
$word_freq = array();

$entries = scandir($base_dir);
if ($entries === false) {
    fwrite(STDERR, "Cannot list directory: $base_dir" . PHP_EOL);
    exit(1);
}

// Keep only the monthly files (YYYY_MM.js) so that the progress counter
// reflects the files that are actually loaded, not every directory entry.
$files = array_values(preg_grep('/^[0-9]{4}_[0-9]{2}\.js$/', $entries));
$total = count($files);
$index_width = strlen((string) $total);

foreach ($files as $index => $file) {
    printf("[%" . $index_width . "d/%d] Loading %s ..." . PHP_EOL,
        $index + 1, $total, $file);

    $contents = file_get_contents("$base_dir/$file");
    if ($contents === false) {
        fwrite(STDERR, "Skipping $file: unable to read file" . PHP_EOL);
        continue;
    }

    // Drop everything up to and including the first "=" (plus any whitespace
    // after it); presumably this removes a JavaScript assignment prefix so
    // that only the JSON payload remains.
    $contents = preg_replace('/^.*?=\s*/s', '', $contents);
    $data = is_string($contents) ? json_decode($contents) : null;
    if (!is_array($data)) {
        fwrite(STDERR, "Skipping $file: content is not a JSON array" . PHP_EOL);
        continue;
    }

    foreach ($data as $tweet) {
        $hashtags = isset($tweet->entities->hashtags)
            ? $tweet->entities->hashtags
            : array();
        foreach ($hashtags as $ht) {
            // Initialise missing counters explicitly instead of relying on
            // incrementing an undefined array key.
            $key = lower($ht->text);
            $hashtag_freq[$key] = isset($hashtag_freq[$key]) ? $hashtag_freq[$key] + 1 : 1;
        }

        $mentions = isset($tweet->entities->user_mentions)
            ? $tweet->entities->user_mentions
            : array();
        foreach ($mentions as $m) {
            $key = lower($m->screen_name);
            $mentions_freq[$key] = isset($mentions_freq[$key]) ? $mentions_freq[$key] + 1 : 1;
        }

        if (!isset($tweet->text)) {
            continue;
        }

        // The "u" modifier makes the pattern operate on UTF-8 characters, so
        // words containing non-ASCII letters are not cut at byte boundaries.
        if (!preg_match_all('/\b(\S+)\b/u', $tweet->text, $word_matches)) {
            continue;
        }
        foreach ($word_matches[1] as $word) {
            $word = lower($word);
            // Words already seen as a hashtag or a mention are not counted
            // as plain words.
            if (array_key_exists($word, $hashtag_freq)
                || array_key_exists($word, $mentions_freq)) {
                continue;
            }
            $word_freq[$word] = isset($word_freq[$word]) ? $word_freq[$word] + 1 : 1;
        }
    }
}

echo PHP_EOL;
print_top('Hashtags', $hashtag_freq, 10);
print_top('Mentions', $mentions_freq, 10);
print_top('Words', $word_freq, 10);
/**
 * Prints a titled, ranked table of the most frequent items.
 *
 * The array is sorted in place by descending count and truncated to the
 * first $max entries, so the caller's array is modified (it is passed by
 * reference).
 *
 * @param string $what  Title printed above the table.
 * @param array  $data  Map of item => count; sorted and truncated in place.
 * @param int    $max   Maximum number of entries to keep and print.
 */
function print_top($what, &$data, $max) {
    arsort($data, SORT_NUMERIC);
    // PHP stores numeric-looking string keys such as "2013" as integers, and
    // array_splice() renumbers integer keys, which would replace those items
    // with 0, 1, 2, ... in the output. array_slice() with preserve_keys
    // keeps the original keys while truncating the same way.
    $data = array_slice($data, 0, $max, true);
    print_title($what, 78);
    $rank = 0;
    foreach ($data as $item => $count) {
        printf('%5s. %-60s %10s' . PHP_EOL,
            number_format(++$rank), $item, number_format($count));
    }
    echo PHP_EOL;
}
/**
 * Prints a title between two horizontal rules made of "=" characters,
 * followed by a blank line. The title is indented so that it appears
 * roughly centred within $width columns.
 *
 * @param string $title  Text to print.
 * @param int    $width  Length of the rules and width used for centring.
 */
function print_title($title, $width) {
    $line = str_repeat('=', $width);
    // Use a whole, non-negative number of spaces: an odd difference must not
    // yield a fractional width, and a title longer than $width gets no
    // indentation at all.
    $padding = max(0, (int) floor(($width - len($title)) / 2));
    echo $line, PHP_EOL;
    echo str_repeat(' ', $padding), $title, PHP_EOL, $line, PHP_EOL, PHP_EOL;
}
/**
 * Returns the lowercase form of a string.
 *
 * Uses mb_strtolower() with UTF-8 when the mbstring extension is loaded,
 * and falls back to strtolower() otherwise.
 *
 * @param string $str  Text to convert.
 * @return string Lowercased text.
 */
function lower($str) {
    if (!extension_loaded('mbstring')) {
        return strtolower($str);
    }

    return mb_strtolower($str, 'UTF-8');
}
/**
 * Returns the length of a string.
 *
 * Uses mb_strlen() with UTF-8 when the mbstring extension is loaded,
 * and falls back to strlen() otherwise.
 *
 * @param string $str  Text to measure.
 * @return int Length of the text.
 */
function len($str) {
    if (!extension_loaded('mbstring')) {
        return strlen($str);
    }

    return mb_strlen($str, 'UTF-8');
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment