Skip to content

Instantly share code, notes, and snippets.

@jwcounts
Last active October 22, 2018 16:12
Show Gist options
  • Save jwcounts/ebf46df324e3e4704ab60dc01b86eb32 to your computer and use it in GitHub Desktop.
Save jwcounts/ebf46df324e3e4704ab60dc01b86eb32 to your computer and use it in GitHub Desktop.
Update to Twitter Analytics Scraper
<?php
// You can set a custom user agent if you like, this one is for Firefox 60
$user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:60.0) Gecko/20100101 Firefox/60.0';
// Username and password
$username = "";
$password = "";
// Set the start and end time. mktime() returns seconds, so appending three zeroes
// turns the value into a millisecond timestamp (not microseconds)
$start = mktime( 0, 0, 0, 6, 19, 2018 ).'000';
$end = mktime( 0, 0, 0, 6, 24, 2018 ).'000';
// Pull in the cookie file to see if authentication tokens already exist.
// The file is missing on a first run; treat that as "no cookies" instead of
// letting file_get_contents() raise a warning and hand back false
$tw_cookie = "./cookie.txt";
$cookie = is_readable( $tw_cookie ) ? file_get_contents( $tw_cookie ) : '';
if ( $cookie === false ) :
    $cookie = '';
endif;
// Set up our basic cURL options
$ch = curl_init();
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Keep certificate and host-name verification enabled: this handle is used to
// send the account password, so an unverified connection could leak it
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
// Read cookies from, and save cookies to, the same file
curl_setopt($ch, CURLOPT_COOKIEFILE, $tw_cookie);
curl_setopt($ch, CURLOPT_COOKIEJAR, $tw_cookie);
curl_setopt($ch, CURLOPT_REFERER, "https://twitter.com/");
// Check if the auth_token cookie exists. If not, log in and grab one
if ( !preg_match( '/auth_token\t([a-z0-9]+)/', $cookie ) ) :
    // First call gets hidden form field authenticity_token and session cookie
    curl_setopt($ch, CURLOPT_URL, "https://twitter.com/");
    $html = curl_exec($ch);
    if ( $html === false ) :
        throw new RuntimeException( 'Could not load the login page: ' . curl_error($ch) );
    endif;
    // parse authenticity_token out of html response.
    // Stop here if it is missing: posting the form without a token would only
    // produce a failed login attempt
    if ( !preg_match('/formAuthenticityToken\&quot\;\:&quot\;([0-9a-zA-Z]+)\&quot\;/', $html, $match) ) :
        throw new RuntimeException( 'formAuthenticityToken was not found in the login page' );
    endif;
    $authenticity_token = $match[1];
    // set post data. http_build_query() URL-encodes every value, so credentials
    // containing characters such as '&', '+' or '%' are sent intact
    $sPost = http_build_query( [
        'session' => [
            'username_or_email' => $username,
            'password' => $password,
        ],
        'return_to_ssl' => 'true',
        'scribe_log' => '',
        'redirect_after_login' => '/',
        'authenticity_token' => $authenticity_token,
    ], '', '&' );
    // second call is a post and performs login
    curl_setopt($ch, CURLOPT_URL, "https://twitter.com/sessions");
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $sPost);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-type: application/x-www-form-urlencoded"));
    if ( curl_exec($ch) === false ) :
        throw new RuntimeException( 'Login request failed: ' . curl_error($ch) );
    endif;
endif;
// Pull in the cookie file again, and remove references to '#HttpOnly_'
// If you don't, subsequent runs will ignore the auth_token because it will be parsed as a comment
// The file may not exist yet on a first run (NOTE(review): cURL appears to write
// the cookie jar only when the handle is closed - confirm), so only rewrite it
// when there is something to read
$cookie = is_readable( $tw_cookie ) ? file_get_contents( $tw_cookie ) : false;
if ( $cookie !== false ) :
    $cookie_strip = str_replace( '#HttpOnly_', '', $cookie );
    file_put_contents( $tw_cookie, $cookie_strip );
endif;
// Do a post request to generate a CSV of the tweets from our time period
$sTarget = "https://analytics.twitter.com/user/{$username}/tweets/export.json?start_time={$start}&end_time={$end}&lang=en";
$sTargetBundle = "https://analytics.twitter.com/user/$username/tweets/bundle?start_time={$start}&end_time={$end}&lang=en";
curl_setopt($ch, CURLOPT_URL, $sTarget);
curl_setopt($ch, CURLOPT_POST, true);
curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-type: application/x-www-form-urlencoded"));
curl_setopt($ch, CURLOPT_POSTFIELDS, '');
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_HEADER, false);
$data_arr = ["status" => "Pending"];
// Since the CSV takes some time to generate, check the status a few times.
// The counter is advanced on every pass so the loop always ends after
// $max_attempts requests, even if the status never leaves "Pending"
$attempts = 0;
$max_attempts = 5;
while ( $attempts < $max_attempts ) :
    $attempts++;
    $data = curl_exec($ch);
    // A failed request or a non-JSON body decodes to null
    $data_arr = is_string($data) ? json_decode($data, true) : null;
    // Stop polling as soon as the response is unusable or no longer pending
    if ( !is_array($data_arr) || !isset($data_arr['status']) || $data_arr['status'] !== "Pending" ) :
        break;
    endif;
    sleep(2);
endwhile;
// Once it is completed, download the CSV file and save it
curl_setopt($ch, CURLOPT_POST, false);
curl_setopt($ch, CURLOPT_URL, $sTargetBundle);
$data = curl_exec($ch);
// report a transfer failure instead of silently discarding it
$error = curl_error($ch);
if ( $data === false ) :
    error_log( "Download of the tweets CSV failed: {$error}" );
    $data = '';
endif;
$destination = "./tweets.csv";
// overwrite the previous file
if ( file_put_contents( $destination, $data ) === false ) :
    error_log( "Could not write {$destination}" );
endif;
// Download the graph data JSON
$sTarget = "https://analytics.twitter.com/user/{$username}/tweets/account_stats.json?start_time={$start}&end_time={$end}";
curl_setopt($ch, CURLOPT_URL, $sTarget);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_HEADER, false);
// Save the JSON file
$data = curl_exec ($ch);
// report a transfer failure instead of silently discarding it
$error = curl_error($ch);
if ( $data === false ) :
    error_log( "Download of the graph data failed: {$error}" );
    $data = '';
endif;
$destination = "./graphs.json";
// overwrite the previous file
if ( file_put_contents( $destination, $data ) === false ) :
    error_log( "Could not write {$destination}" );
endif;
// Data for SVG timeline
$sTarget = "https://analytics.twitter.com/user/{$username}/tweets/timeline.json?start_time={$start}&max_id=0&end_time={$end}&page=0&filter=no_replies&metric=clicks&lang=en";
curl_setopt($ch, CURLOPT_URL, $sTarget);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Unlike the earlier requests, redirects are not followed for this one
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, false);
// Save the timeline data
$data = curl_exec ($ch);
// report a transfer failure instead of silently discarding it
$error = curl_error($ch);
if ( $data === false ) :
    error_log( "Download of the timeline data failed: {$error}" );
    $data = '';
endif;
$destination = "./timeline.json";
// overwrite the previous file
if ( file_put_contents( $destination, $data ) === false ) :
    error_log( "Could not write {$destination}" );
endif;
curl_close ($ch);
?>
@jwcounts
Copy link
Author

Revised the code to properly save all of the cookies to a file so that subsequent requests don't have to log in again.

I ran into an issue where I ran the test script too many times in too short a period and temporarily locked my account. This should help with that.

@jwcounts
Copy link
Author

Leaving this here because it's an interesting experiment, but be ye warned: it's also a really good way to lock up your account. Apparently this type of automated behavior is against the rules. Granted, I or anyone else wouldn't have to resort to this if Twitter just had an Analytics API that wasn't only about ads or stupid expensive.

@amikehere
Copy link

Thanks for updating it! Warning understood, do you have a guess on how often was it running that resulted in a lock up?

@jwcounts
Copy link
Author

Thanks for updating it! Warning understood, do you have a guess on how often was it running that resulted in a lock up?

Sorry I missed your question for so long! If you're still interested, I initially ran into problems while testing, since I would tweak the script and test it several times over the course of an hour or so. I then tried to space out runs of the script every couple of hours, but that started backfiring too. Pretty soon, any run of the script, no matter how long I waited, would lock the account, even with the modifications I had made.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment