Skip to content

Instantly share code, notes, and snippets.

@pjaspers
Last active October 25, 2023 21:13
Show Gist options
  • Save pjaspers/e946e3dfd8915ed916c658d6c221800b to your computer and use it in GitHub Desktop.
Save pjaspers/e946e3dfd8915ed916c658d6c221800b to your computer and use it in GitHub Desktop.
If you have your twitter archive laying around, this will put it in a sqlite. Save it somewhere and run it with `ruby twitter-archive.rb` and it should just work (if you did a gem install sqlite3 first)
gem "sqlite3", "~> 1.6.7"
require "sqlite3"
require "json"
require "time"
archive_directory = "~/Documents/Archives/twitter-2023-07-01-826d9daac5cf75f140476a88ff1e97b0f4ba6b8bba8b47c62f8b8c1e65f7aee0"
output_db_path = "~/Sync/twitter.db"
begin
lines = File.readlines(File.expand_path(File.join(archive_directory, "data/tweets.js")), chomp: true); nil
rescue Errno::ENOENT
abort <<~STR
Nope, nope, nope
Couldn't find a data/tweets.js in
#{File.expand_path(archive_directory)}"
STR
end
lines[0] = lines[0].split(" = ").last # remove twitters weird window thing and make it json
parsed = JSON.parse(lines.join("\n")); nil
# json.first["tweet"].keys has the columns
#=> ["edit_info", "retweeted", "source", "entities", "display_text_range", "favorite_count", "in_reply_to_status_id_str", "id_str", "in_reply_to_user_id", "truncated", "retweet_count", "id", "in_reply_to_status_id", "created_at", "favorited", "full_text", "lang", "in_reply_to_screen_name", "in_reply_to_user_id_str"]
db = SQLite3::Database.new(File.expand_path(output_db_path))
puts "Setting up DB at #{output_db_path}..."
db.execute_batch(<<~SQL)
CREATE TABLE if not exists tweets (
id varchar(500) primary key,
in_reply_to_status_id varchar(100),
in_reply_to_user_id varchar(100),
truncated boolean,
retweet_count integer,
created_at datetime,
favorited boolean,
text text,
lang varchar(20),
in_reply_to_screen_name varchar(200),
source text
);
SQL
db.execute_batch(<<~SQL)
CREATE VIRTUAL TABLE if not exists "tweets_fts" USING FTS5 (
id,
text,
created_at,
content="tweets"
);
SQL
insert = db.prepare(<<~SQL)
insert or ignore into tweets(id, in_reply_to_status_id, in_reply_to_user_id, truncated, retweet_count, created_at, favorited, text, lang, in_reply_to_screen_name, source)
values ( :id, :in_reply_to_status_id, :in_reply_to_user_id, :truncated, :retweet_count, :created_at, :favorited, :text, :lang, :in_reply_to_screen_name, :source)
SQL
puts "Moving JSON to sqlite with high tech computering"
db.transaction do
parsed.each do |tweet|
tweet = tweet["tweet"]
insert.execute(
id: tweet["id_str"],
in_reply_to_status_id: tweet["in_reply_to_status_id"],
in_reply_to_user_id: tweet["in_reply_to_user_id"],
truncated: tweet["truncated"] ? 1 : 0,
retweet_count: tweet["retweet_count"],
created_at: Time.parse(tweet["created_at"]).iso8601,
favorited: tweet["favorited"] ? 1 : 0,
text: tweet["full_text"],
lang: tweet["lang"],
in_reply_to_screen_name: tweet["in_reply_to_screen_name"],
source: tweet["source"]
)
end
end
insert = db.prepare(<<~SQL)
insert into tweets_fts(rowid, id, text, created_at)
select tweets.rowid, tweets.id, tweets.text, tweets.created_at from tweets;
SQL
insert.execute
puts "\n\nSome things to try\n\n"
[
["# Tweets per year", %Q[select strftime("%Y", created_at), count(*) as count from tweets group by strftime("%Y", created_at) order by count desc]],
["Favorite reply to's", %Q[select in_reply_to_user_id, in_reply_to_screen_name, count(*) as cnt from tweets where in_reply_to_user_id <> "" group by in_reply_to_user_id order by cnt desc limit 10]],
["Most tweets in a single day", %Q[select count(*) as count, strftime("%Y%m%d", created_at) from tweets group by strftime("%j%Y", created_at) order by count desc limit 10]],
].each do |name, query|
puts name
puts
puts " sqlite3 #{output_db_path} '#{query}'"
puts
db.execute(query).each do |(thing, count)|
puts " #{thing} => #{count}"
end
end
puts <<~STR
Search for tweets with:
sqlite3 -column #{output_db_path} "select text, created_at, id from tweets where text like '%<term here>%'";
Or full text search with:
sqlite3 -column #{output_db_path} "select text, created_at, id from tweets_fts where text match '<term here>'";
STR
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment