-
-
Save dylanmckay/2b191a10068bd87d0fffba242db44b52 to your computer and use it in GitHub Desktop.
#! /usr/bin/env ruby | |
# NOTE: Requires Ruby 2.1 or greater. | |
# This script can be used to parse and dump the information from | |
# the 'html/contact_info.htm' file in a Facebook user data ZIP download. | |
# | |
# It prints all cell phone call + SMS message + MMS records, plus a summary of each. | |
# | |
# It also dumps all of the records into CSV files inside a 'CSV' folder, that is created | |
# in whatever the working directory of the program is when executed. | |
# | |
# Place this script inside the extracted Facebook data download folder | |
# alongside the 'html' folder. | |
# | |
# This script requires Ruby and the Nokogiri library to be installed. | |
# | |
# Open source licensing | |
# --------------------- | |
# | |
# Dual-licensed under the MIT and Apache 2.0 open source licenses. Either license can be chosen | |
# by any user of the program. | |
# | |
# The MIT license is duplicated here, the Apache 2.0 license can be found here | |
# https://opensource.org/licenses/Apache-2.0 | |
# | |
# The MIT License (MIT) | |
# Copyright (c) 2018 Dylan McKay | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated | |
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation | |
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | |
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE | |
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS | |
# OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR | |
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |
require 'csv'
require 'fileutils'
require 'time'

require 'nokogiri'
# Prints a horizontal rule (a line of 24 dashes) to standard output.
def hr
  rule = "-" * 24
  $stdout.puts(rule)
end
# Writes (level - 1) copies of the indentation unit to standard output,
# without a trailing newline, and flushes the stream.
#
# Returns the value of $stdout.flush; callers in this file chain on it
# with `and`, so it must stay truthy.
def indent(level = 1)
  padding = " " * (level - 1)
  $stdout.print(padding)
  $stdout.flush
end
# Prints a titled section to standard output: a rule, the title, a blank
# line, whatever the given block prints, a closing rule and a blank line.
# Every framing line is preceded by the indentation for `level`.
#
# title - text printed as the section heading.
# level - indentation level passed to `indent` (default 1).
#
# Yields once, with no arguments, between the header and the footer.
def section(title, level: 1)
  # Each framing line is only printed when `indent` reports success
  # (a truthy return value), mirroring the original `and` chaining.
  hr if indent(level)
  $stdout.puts(title) if indent(level)
  $stdout.puts if indent(level)

  yield

  hr if indent(level)
  $stdout.puts if indent(level)
end
# Converts one text field of a record into a Time when it looks like a
# Facebook timestamp, e.g. "Wednesday, 14 June 2017 at 19:02 UTC+12".
# Best effort: any field that cannot be parsed is returned unchanged.
def parse_metadata_field(field)
  return field unless field.include?(' at ') # only dates/times contain ' at '

  Time.strptime(field, "%A, %e %B %Y at %R UTC%z")
rescue StandardError
  field
end

# Extracts metadata from a call/text/sms/mms table.
#
# metadata_table - an object responding to #css (a Nokogiri node in this
#                  script), or nil when the expected table was not found.
#
# Returns nil if there is no metadata in this table: the table is nil, has
# no rows at all, or has at most one row after the heading row.
# Otherwise returns a 2d list of rows/columns whose first row holds the
# headings; date-like fields are converted to Time where possible.
def extract_table_metadata(metadata_table)
  return nil if metadata_table.nil?

  rows = metadata_table.css('tr')
  heading_row = rows.first
  return nil if heading_row.nil? # a table without any <tr> has nothing to extract

  headings = heading_row.css('th').map { |cell| cell.text.chomp }
  records = rows[1..-1]
  # Many tables are empty (excluding headings).
  # NOTE(review): this threshold also discards tables holding exactly one
  # record; kept as-is because the archive format cannot be confirmed here.
  return nil if records.size <= 1

  body = records.map do |record|
    record.css('td').map { |cell| parse_metadata_field(cell.text.chomp) }
  end

  [headings] + body
end
# Collects the metadata tables found directly under a history container.
#
# container - the node holding one <table> child per contact, or nil.
#
# Returns a list of 2d row/column lists (one per contact table) that
# contain at least one row besides the headings; [] when container is nil.
def dig_out_metadata(container:)
  # If a specific type of metadata is missing (calls, texts, ..), the
  # container div will simply not be present.
  return [] if container.nil?

  per_contact_tables = container.children.select { |child| child.name == "table" }

  extracted = per_contact_tables.map do |per_contact_table|
    # The records live in the first table nested inside the contact table.
    extract_table_metadata(per_contact_table.css('table')[0])
  end

  # Drop tables without metadata and those with nothing but a header row.
  extracted.compact.select { |rows| rows.size > 1 }
end
# Prints every record of every table in `metadata` inside a titled section.
#
# metadata       - list of tables; each table is a list of rows (arrays).
# metadata_title - heading printed for the section.
#
# Each table is introduced by an "Another phone number" line and each row
# is printed as its fields joined by ", ".
def print_metadata(metadata, metadata_title:)
  section(metadata_title) do
    metadata.each do |table|
      puts
      puts "Another phone number" if indent(2)
      puts

      table.each { |row| puts(row.join(", ")) if indent(2) }
    end
  end
end
# Turns one timestamp field into a Time, or nil when that is not possible.
# Time values are passed through untouched; anything else is parsed from
# its string form, and blank or unparseable text yields nil.
def coerce_timestamp(value)
  return value if value.is_a?(Time)

  text = value.to_s.strip
  return nil if text.empty?

  Time.parse(text)
rescue ArgumentError # do not keep the timestamp if it is unparseable
  nil
end

# Prints the date range (oldest and most recent) covered by the records.
#
# metadata      - list of tables; each table is a list of rows whose first
#                 row is the heading row.
# metadata_name - singular noun used in the printed sentence.
#
# Prints nothing when no usable timestamp is found.
#
# The timestamp is read from the second column of every record
# (NOTE(review): assumes the timestamp is column 1 for every record type;
# confirm against the archive format).
def print_timestamps(metadata, metadata_name:)
  # Look at every record of every table, skipping each table's heading row.
  records = metadata.flat_map { |table| table.drop(1) }
  # Unparseable values are dropped so that min/max only ever compare Times.
  timestamps = records.map { |record| coerce_timestamp(record[1]) }.compact
  return if timestamps.empty?

  puts "The oldest #{metadata_name} is from #{timestamps.min.to_date}, the most recent at #{timestamps.max.to_date}"
end
# Prints how many records exist per status (the first column of a record).
#
# metadata      - list of tables; each table is a list of rows whose first
#                 row is the heading row.
# metadata_name - plural noun appended after each status in the sentence.
#
# Prints nothing when there are no records.
def print_status_breakdown(metadata, metadata_name:)
  # Skip each table's heading row so the column title is not counted as a
  # status of its own.
  records = metadata.flat_map { |table| table.drop(1) }
  # to_s keeps records with a missing status from crashing `downcase`.
  by_status = records.group_by { |record| record.first.to_s }
  return if by_status.empty?

  parts = by_status.map { |status, rows| "#{rows.size} #{status.downcase} #{metadata_name}" }
  puts "This includes " + parts.join(", ")
end
# Serialises all rows of all tables into CSV text.
#
# metadata - list of tables; each table is a list of rows (arrays).
#
# Returns one CSV line per row, joined by "\n" with no trailing newline
# ("" when there are no rows). Fields containing commas, quotes or line
# breaks are quoted by the CSV library, so raw Facebook timestamps such as
# "Wednesday, 14 June 2017 at 19:02 UTC+12" stay in a single column.
def metadata_to_csv(metadata)
  metadata.flatten(1).map { |record| CSV.generate_line(record).chomp }.join("\n")
end
# Writes the call, SMS and MMS metadata found in `html_doc` to
# csv/call.csv, csv/sms.csv and csv/mms.csv, creating the "csv" directory
# (relative to the current working directory) when needed.
#
# html_doc - the parsed contact-info document; must respond to #xpath.
def dump_metadata_csv(html_doc)
  # [csv file stem, heading text of the matching <h2>]
  kinds = [["call", "Call History"], ["sms", "SMS History"], ["mms", "MMS History"]]

  # The history container is the first <div> that follows the <h2>.
  containers = kinds.map do |_stem, heading|
    html_doc.xpath("//h2[text()='#{heading}']/following-sibling::div")[0]
  end

  FileUtils.mkdir_p("csv")

  tables = containers.map { |found| dig_out_metadata(:container => found) }

  kinds.zip(tables).each do |(stem, _heading), metadata|
    File.write(File.join("csv", "#{stem}.csv"), metadata_to_csv(metadata))
  end
end
# Returns the first <div> following the <h2> whose text equals `heading`,
# or nil when the document has no such heading.
def find_history_container(html_doc, heading)
  html_doc.xpath("//h2[text()='#{heading}']/following-sibling::div")[0]
end

# Collects the sorted, de-duplicated phone numbers that follow a bold
# "Number:" label inside any of the given containers (nil entries are
# ignored).
def collect_phone_numbers(containers)
  numbers = containers.compact.flat_map do |container|
    # The leading "." keeps the search inside this container; a bare "//"
    # would start from the document root regardless of the context node.
    container.xpath(".//b[text()='Number:']/following-sibling::text()").map { |node| node.text.strip }
  end
  numbers.reject(&:empty?).sort.uniq
end

# Counts the records in all tables, leaving out each table's heading row.
def count_metadata_records(metadata)
  metadata.inject(0) { |total, table| total + table.size - 1 }
end

# Prints the count, date range and status breakdown for one record type.
def print_record_summary(metadata, singular:, plural:)
  $stdout.puts "There are records of #{count_metadata_records(metadata)} distinct #{plural}"
  indent(2) and print_timestamps(metadata, :metadata_name => singular)
  indent(2) and print_status_breakdown(metadata, :metadata_name => plural)
end

# Prints a human-readable report of the call, SMS and MMS metadata found in
# `html_doc`: every record per history type, the list of phone numbers with
# stored data, and a brief summary.
#
# html_doc - the parsed contact-info document; must respond to #xpath.
def print_metadata_human(html_doc)
  call_history_container = find_history_container(html_doc, "Call History")
  sms_history_container = find_history_container(html_doc, "SMS History")
  mms_history_container = find_history_container(html_doc, "MMS History")

  call_metadata = dig_out_metadata(:container => call_history_container)
  sms_metadata = dig_out_metadata(:container => sms_history_container)
  mms_metadata = dig_out_metadata(:container => mms_history_container)

  # Gather numbers from every history type, so an archive with SMS/MMS
  # records but no call history still reports its phone numbers.
  phone_numbers = collect_phone_numbers(
    [call_history_container, sms_history_container, mms_history_container]
  )

  print_metadata(call_metadata, :metadata_title => "Call History")
  print_metadata(sms_metadata, :metadata_title => "SMS History")
  print_metadata(mms_metadata, :metadata_title => "MMS History")

  section("The full list of phone numbers that have stored data") do
    # Eight numbers per printed line.
    phone_numbers.each_slice(8) do |group|
      indent(2) and $stdout.puts group.join(", ")
    end
  end

  $stdout.puts "A brief summary of phone records"
  hr
  $stdout.puts "There are phone records for #{phone_numbers.size} distinct phone numbers"
  print_record_summary(call_metadata, :singular => "cell phone call", :plural => "cell phone calls")
  print_record_summary(sms_metadata, :singular => "SMS message", :plural => "SMS messages")
  print_record_summary(mms_metadata, :singular => "MMS message", :plural => "MMS messages")
  hr
end
# Script entry point: read the contact info page from the extracted
# Facebook archive, print the human-readable report, then write the CSVs.
contact_info_path = File.join('html', 'contact_info.htm')

# Fail with an actionable message instead of a raw "No such file" error
# when the script is run from the wrong folder.
unless File.file?(contact_info_path)
  abort "Could not find #{contact_info_path}. Place this script inside the extracted " \
        "Facebook data download folder, alongside the 'html' folder, and run it from there."
end

html_text = File.read(contact_info_path)
html_doc = Nokogiri::HTML(html_text)

print_metadata_human(html_doc)

$stdout.puts
hr
# Report success only after the files have actually been written.
dump_metadata_csv(html_doc)
$stdout.puts "dumped metadata to CSV files at #{Dir.pwd}/csv"
Is there a way we could just pull this data from an API without downloading the zip ?
Question for the group... I did change the default prompts during the install from C: to D: as I have more space on D. I don't seem to think it would be the problem.
Sounds like you did everything right - there aren't any metadata records in the data dump, which is good.
You can confirm this by looking at html/contact_info.htm
. Scroll through the page - you shouldn't see any "Call History" or "SMS History" headings
@nachopants that looks like the "contacts" table.
That is separate to the call+sms metadata sections.
If you have no contacts, the table you are inspecting via Chrome Dev tools would not exist.
If there are call records, there should be a <h2>Call History</h2>
, or a <h2>SMS History</h2>
.
If you cannot see the latter, it sounds like you probably don't have any data stored.
Nope - the metadata does not get exposed to third parties, therefore it is not in the API. You may be able to use the API to request a list just of contacts though.
@JohnAmbra I've pushed a new version of the script that should ignore any time parse errors and use the original time format outputted by FB.
Thanks for your kind comments!
I concur with @JohnAmbra, thank you for doing this. In light of the situation over the last few days, thank you for putting this together.
I did confirm that I have no Call history or SMS history so thank you for the confirmation.
As a related enhancement, have you thought of possibly extending the script so that it converts each of the HTML pages into respective CSV pages to facilitate our own review?
@dylanmckay
Thanks for creating this script man, I ran it against the zip file I downloaded today and it did not find any records of txt or calls.
Could it be possible that facebook updated their side to not include that data in the archived zip anymore?
Maybe people that already verified they had txt/calls tracked can re-download their archive to verify they are still there. This will confirm if facebook is doing damage control?
@mforce
I had the same concern, but I'm wondering if it's related to the type of device used? I've been using iOS for quite some time, and with the difference in security prompts and easy access to those controls, maybe Android OS devices (for example) have been more susceptible than iOS devices?
I did the same process that it sounds like you did and found no call records and no sms records, which I was expecting I'd encounter. So either I'm doing something wrong, the records don't exist, or (as you stated) Facebook has stopped providing those records.
@davisx23 And yet I crack open the files (since they're html) and there are plenty of phone number there (some I thought I'd long lost but I'm still mad AF!).
Run the script and nothing found.
The script returns no data because it reads the contact_info.htm file and looks for sections titled "Call History", "SMS History", and "MMS History". If those sections are not present in the file, the script will return no data.
I concur with @JohnAmbra, thank you for doing this. In light of the situation over the last few days, thank you for putting this together.
Thanks!
As a related enhancement, have you thought of possibly extending the script so that it converts each of the HTML pages into respective CSV pages to facilitate our own review?
It already has this! Check out the csv
folder in the same folder when you run the script.
It does have duplicated "Status, Timestamp, Duration, Contact name" headings for each mobile number, but those are easy to filter out manually.
Thanks for creating this script man, I ran it against the zip file I downloaded today and it did not find any records of txt or calls.
Could it be possible that facebook updated their side to not include that data in the archived zip anymore? Maybe people that already verified they had txt/calls tracked can re-download their archive to verify they are still there. This will confirm if facebook is doing damage control?
To be honest I doubt it. It'd be way too obvious to simply start hiding this data, and the PR nightmare it'd cause makes the decision an obvious bad one for any business IMO.
I've also redownloaded my data (~2 days ago) and confirmed that the metadata was still there. This is obviously just one data point though.
You need to be careful when doing this because FB seems to cache the ZIP for ~5 days. That means in order to get a freshly built ZIP based on current data, you need to wait a while before requesting it.
I think it's fair to say that the metadata was never collected on your account.
I had the same concern, but I'm wondering if it's related to the type of device used? I've been using iOS for quite some time, and with the difference in security prompts and easy access to those controls, maybe Android OS devices (for example) have been more susceptible than iOS devices?
It seems that way. Of all of the people I've seen with metadata downloads, I can't recall a single one who mentioned iOS, but I've seen dozens of tweets specifically mentioning collected metadata and Android.
Any help with this error? I have downloaded Ruby, installed nokogiri but still do not get an output. I manually check the files and do not see anything but i want to understand how to make it work. I want to be able to check out files for my family members.
I am totally unfamiliar with coding, but am computer literate. I've downloaded Ruby and the FB data, but have no idea how to get it going. Please help. I'm using Win 7, SP1, 64 bit OS.
I am receiving the following error when executing the script:
./facebook-contact-info-summary.rb:55: syntax error, unexpected tLABEL
def section(title, level: 1)
^
@AMurray8 place the script next to the html
directory - it looks like it's currently inside of it.
Currently, it is looking for html/html/index.html
, but if it is moved up a level it should work.
You're using Ruby <2.0, the script only runs on Ruby 2.1+, which added support for keyword arguments.
It took a lot of trial and error to finally get Ruby 2.4.1 installed on my MBPro running Sierra, so that I could use this script, but it now works. Most of my problems were due to file ownership issues in the /usr/local/lib and /usr/local/sbin folders that prevented HomeBrew from working properly. Once that was resolved, the rest went smoothly.
It seems that they have cleaned up any damning evidence in the Facebook Data download that they are now releasing to their Users.
I don't know if they have deleted any of the Private information from their systems (ie. phone calls, SMS messages, etc), that some people have seen in earlier downloads, but they are no longer including that data in the ZIP file that Users can download from Facebook.
I hope that Apple gives them hell about recording User activity (phone calls and messaging) that was unrelated to the Facebook App. That is a BIG violation of the Apple Developer Agreement.
I've been a Facebook user since 2007, in the US, with the FB and Messenger app on both iOS and Android phones for much of that time. My data contains none of the phone records in the initial @dylanmckay example. I'm guessing it's a phone or messaging service I never opted into?
My total Facebook data download was about 230 MB uncompressed. In comparison, I just requested my Google data dump and it's 321 GB, of compressed data. I can't wait to learn what that one has in it.
Thanks for putting this script together @dylanmckay
I downloaded a new copy of my fb data today (i'm in europe), and the structure of the files and folders has completely changed since the last time i did it (and obviously the script isn't working, since the file the script tries to parse does not exist)
Am I alone in this situation ?
The place to download one's fb data has changed. It seems to change often. Three ways that currently work:
At the moment, you can save steps and time: A & B: go straight to step 3.1. C:
- Start at almost any facebook page; click on the "?" in the top right.
- Click "Privacy Shortcuts" (third from the bottom), which opens https://www.facebook.com/privacy/ . You are in a twisty maze of passages, all almost alike.
- Click "Access your information" - (which is first under the "Your Facebook Information" (fourth) section (which states: View or download your Facebook information at any time."; I had to laugh when I noticed later that the short URL https://www.facebook.com/dyi/ redirects there)) in order to ...
3.1. A: load https://www.facebook.com/your_information/ (Or, if that fails: B: Load https://www.facebook.com/dyi/ and ...) - Click "download your information" - which is B)in the first box of text, near the bottom, OR B)in the second row. (This opens a unique URL like https://www.facebook.com/dyi/?x=Admungedmungedei ... which you can bookmark - once you have the URL with the value of x that works for your account. )
- There, one is given the CHOICE of HTML or JSON format, and many other choices; with defaults:
Date Range:All of my data
Format:HTML
Media Quality:Medium
Among the dozens of items (Your Information:20, Information About You:6) is this one (#23):
Calls and Messages
Logs of your calls and messages that you've chosen to share in your device settings !!!
Accessed while in California, today.
It would be interesting to learn what fb goes by when it decides what to store, and changes what it makes available (and/or how) - (e.g. current IP geolocation, stated country of residence, usual IP geolocation, etc.) Clearly dates are important, as the survey results show. Speaking of those results - see my following comment. (It's a separate subtopic, so I'm posting separately.)
And THANK YOU, Dylan.
Dylan, do you want to write / would you like/accept a crowdfunded X$ for a new script (and thanks for the old one)?
The google form (questionnaire) has been closed*; results: http://archive.is/yb7xS; I like the cute "This form was created inside of Dylan." tweak to it. Wait--super freaky. The archive is showing different data! The live page and archive both show: Q1: Is there any call, text, or MMS metadata? 1,716 responses, 76% no, and Q2 data, but the archive lacks all the other data, and FALSELY displays "No responses yet for this question". (Not surprisingly, the IA / archive.org won't save it, even manually, because it's programmed to consider itself a robot/respect robots.txt; It doesn't make an exception when run manually.)
Select the type(s) of metadata that were collected 865 responses
Cellular calls 277 (32%)
MMS messages 269 (31%)
None 194 (22%)
ALL 763 (88%)
can't leave blank 7 ( 1%) ...
*"The form Facebook data dump contact information is no longer accepting responses.
Try contacting the owner of the form if you think this is a mistake."
I later found a partial archive at https://web-beta.archive.org/web/20190215193157/https://docs.google.com/forms/d/1g_K4yurJqCvBSknQHMdkvX-0zEEA2vMs5ZKnScaQ1mg/viewanalytics
-It shows (for me, currently) the answers to Q3 and Q4 - When does the metadata [start|end] (if at all) - Most common starts: 7:1/1/09. 10:1/1/18. 7:3/26/18.(data entry error?) (rest: <5). Common ends:8 days in Mar 2018, 3/21-3/29.
(I then pointed WebCite at it, and obtained http://www.webcitation.org/76DCI2a2u which has 0 question answers.
I then tried again, and obtained http://archive.fo/FrWv4 which was like the archive.is result.)
After I requested a download, facebook asked me:
We'd like to hear from you!
We're asking a small group of people about their experience here. Could you help us out by letting us know your reasons for downloading your information today? You can choose any or all of the following:
- To understand what information is available to be downloaded
- To keep a record of what you've shared on facebook
- To transfer your information to another service or app
If none of these apply, let us know your reason here:
Thanks for helping us improve Facebook!
I just closed the window.
As is the case with many others, when running the script I get this:
Call History
SMS History
MMS History
The full list of phone numbers that have stored data
A brief summary of phone records
There are phone records for 0 distinct phone numbers
There are records of 0 distinct cell phone calls
There are records of 0 distinct SMS messages
There are records of 0 distinct MMS messages
Does this mean they are no longer including that data in the dump, or that they never collected it for me? Anyone have any ideas?