Skip to content

Instantly share code, notes, and snippets.

@karmi
Created May 15, 2011 11:15
Show Gist options
  • Star 28 You must be signed in to star a gist
  • Fork 21 You must be signed in to fork a gist
  • Save karmi/973059 to your computer and use it in GitHub Desktop.
Save karmi/973059 to your computer and use it in GitHub Desktop.
Import your Gmail messages into ElasticSearch and search them with a simple web application.
.DS_Store
*.log
Gemfile.lock
# Gem dependencies shared by the import script and the search application.
# Gems are fetched over HTTPS so that downloads cannot be altered in transit.
source 'https://rubygems.org'

gem 'tire'
gem 'mime'
gem 'gmail'
gem 'sinatra'
# =======================================================
# Importing GMail messages into ElasticSearch
# =======================================================
#
# Import your GMail messages into ElasticSearch and search them with a simple web application.
#
# Requirements:
# -------------
#
# * ElasticSearch 0.16.x
# * Ruby 1.8.x
# * Rubygems
# * Bundler gem
#
# Usage:
# ------
#
# Install the required gems:
#
# $ bundle install
#
# Run this script to import your e-mail into ElasticSearch:
#
# $ ruby gmail-import.rb user@gmail.com yourpassword
#
# Note that messages are fetched one by one, so the import speed depends on your connection.
# You may abort the process at any time and search the messages already stored.
#
# Then launch the web application:
#
# $ INDEX=user@gmail.com ruby gmail-server.rb
#
# Open <http://localhost:4567/> in your browser.
#
#
require 'rubygems'
require 'time'
require 'iconv'
require 'tire'
require 'mime'
require 'gmail'
# Write progress output immediately instead of buffering it.
STDOUT.sync = true

# The GMail credentials are the first two command-line arguments.
USERNAME, PASSWORD = ARGV

if !USERNAME || !PASSWORD
  puts "[ERROR] Please provide your GMail credentials:",
       "",
       " #{__FILE__} username@gmail.com password",
       ""
  exit(1)
end

# Import state shared by the progress output and the final report:
# messages stored so far, messages in the inbox, and messages that failed.
@done, @total, @errors = 0, 0, []
# Formats a duration given in seconds as a human-readable string.
#
# elapsed - Numeric number of seconds (may be fractional), or nil when no
#           duration has been measured yet.
#
# Returns a String such as "12.34567 seconds", "3 minutes and 20 seconds",
# "2 hours and 5 minutes" or "30 hours"; for nil it returns
# "an unknown amount of time".
def elapsed_to_human(elapsed)
  # The report can be requested (e.g. from the interrupt handler) before the
  # timing has finished, in which case there is no value to format.
  return 'an unknown amount of time' if elapsed.nil?

  minute = 60
  hour   = 60 * minute
  day    = 24 * hour
  whole  = elapsed.to_i

  # Exclusive ranges leave no gaps for fractional values such as 59.5.
  case elapsed
  when 0...minute
    "#{sprintf('%1.5f', elapsed)} seconds"
  when minute...hour
    "#{whole / minute} minutes and #{whole % minute} seconds"
  when hour...day
    # The remainder is in seconds, so convert it to whole minutes.
    "#{whole / hour} hours and #{(whole % hour) / minute} minutes"
  else
    "#{whole / hour} hours"
  end
end
# Builds the import statistics message.
#
# Reads the @done, @elapsed and @errors instance variables and the USERNAME
# constant.
#
# Returns a multi-line String that starts and ends with a newline.
def report
  summary = "Imported #{@done} messages into index: " \
            "<http://localhost:9200/#{USERNAME}/_search?q=*> "
  details = "in #{elapsed_to_human(@elapsed)}. " \
            "There were #{@errors.size} errors."

  "\n#{summary}\n#{details}\n"
end
# On Ctrl-C, print the statistics gathered so far and terminate. The exit
# status is 1 when at least one message failed to import, 0 otherwise.
#
Signal.trap(:INT) do
  status = @errors.empty? ? 0 : 1
  puts "\r\nExiting...\n"
  puts report
  exit(status)
end
# Set up an ElasticSearch index with the same name as your account
#
index = Tire.index USERNAME do
  # Remove the index first if the FORCE environment variable is set
  #
  delete if ENV['FORCE']

  # Create the index for messages with proper mapping.
  #
  # Fields that must match exactly (`id`, `from.exact`) are stored with
  # `:index => 'not_analyzed'`, the string-mapping option that disables
  # analysis.
  #
  create :mappings => {
    :message => {
      :properties => {
        :id      => { :type => 'string', :index => 'not_analyzed' },
        :subject => { :type => 'string', :analyzer => 'snowball', :boost => 10 },
        :from    => { :type   => 'multi_field',
                      :fields => {
                        :from  => { :type => 'string', :analyzer => 'snowball', :boost => 100 },
                        :exact => { :type => 'string', :index => 'not_analyzed' }
                      } },
        :to      => { :type => 'string', :analyzer => 'keyword' },
        :date    => { :type => 'date' },
        :body    => { :type => 'string', :analyzer => 'snowball' }
      }
    }
  }
end
# Import the inbox, measuring how long the whole run takes.
#
@elapsed = Benchmark.realtime do
  # Helper method to strip invalid UTF-8 sequences: the string is converted
  # from UTF-8 to UTF-8 with //IGNORE. A trailing space is appended before the
  # conversion and cut off afterwards (presumably so that a broken sequence at
  # the very end of the string is dropped as well).
  #
  def force_utf(s)
    Iconv.conv('UTF-8//IGNORE', 'UTF-8', s + ' ')[0..-2]
  end

  puts '-'*80, "Connecting to GMail account '#{USERNAME}'...", '-'*80

  # Connect to GMail account
  #
  Gmail.new(USERNAME, PASSWORD) do |gmail|
    @total = gmail.inbox.count
    puts "Importing #{@total} messages, press Ctrl-C to abort...", '-'*80

    # Process inbox messages one by one
    #
    gmail.inbox.emails.each do |email|
      # Defensively define message properties (clean IDs, force UTF, etc).
      # An unparsable or missing date is stored as nil; a message without
      # parts falls back to its whole body.
      #
      document = {}
      document[:id]      = email.message_id.to_s.tr('<>', '').tr('/', '-')
      document[:subject] = force_utf(email.subject.to_s)
      document[:from]    = Array(email.from).map { |a| "#{a.name} <#{a.mailbox}@#{a.host}>" }
      document[:to]      = Array(email.to).map { |a| "#{a.name} <#{a.mailbox}@#{a.host}>" }
      document[:date]    = (Time.parse(email.date).strftime('%Y-%m-%dT%H:%M:%S%z') rescue nil)
      document[:body]    = force_utf( (email.body.parts.first.body.to_s rescue email.body.to_s) )

      begin
        # Store the message in the index
        #
        # NOTE(review): newer Tire releases appear to expect the document as
        # the first argument (with the type given inside it) -- confirm the
        # call against the installed Tire version.
        #
        index.store :message, document
        @done += 1
        puts "\e[32m#{@done.to_s.ljust(4)}\e[0m #{email.subject} <#{email.from_addrs.join(', ')}>"
      rescue StandardError => e
        # Display failure message and carry on with the next message.
        #
        # Only StandardError is rescued, so that the SystemExit raised by the
        # Ctrl-C handler's `exit` is not swallowed and counted as a failure.
        #
        puts "\e[31m[!]\e[0m #{email.subject} <#{email.from_addrs.join(', ')}>"
        puts " #{e.inspect}"
        @errors << email
      end
    end
  end
end

puts report
# =======================================================
# Simple web application to search your GMail messages
# =======================================================
#
# Usage:
# ------
#
# First, import your messages with the `gmail-import.rb` script.
#
# Then, launch this application:
#
# $ INDEX=user@gmail.com ruby gmail-server.rb
#
#
require 'rubygems'
require 'erb'
require 'tire'
require 'sinatra'
# The index to search is named by the INDEX environment variable; refuse to
# start without it.
if ENV['INDEX'].nil?
  puts "[ERROR] Please set the index name with the INDEX environment variable:",
       "",
       " $ INDEX=user@gmail.com ruby #{__FILE__}",
       ""
  exit(1)
end

# Application settings: results per page, and templates are looked up in the
# directory of this file.
configure do
  set :per_page, 25
  set :views, File.dirname(__FILE__)
end
helpers do
# Converts plain text into simple HTML paragraphs.
#
# text - String to format; nil is treated as an empty string. The argument
#        itself is never modified.
#
# Returns a new String wrapped in <p>...</p>. The text is not HTML-escaped.
def simple_format(text)
  html = text.to_s.gsub(/\r\n?/, "\n")     # \r\n and \r -> \n
  html = html.gsub(/\n\n{2}/, "\n")        # a run of three newlines -> one newline
  html = html.gsub(/\n\n+/, "</p>\n<p>")   # two or more newlines -> paragraph break
  "<p>#{html}</p>"
end
# Renders +name+ as a link to +url+ when +condition+ is truthy, and as plain
# text otherwise. Note that, despite the method's name, it is a truthy
# condition that produces the link.
def link_to_unless(condition, name, url)
  if condition
    "<a href=\"#{url}\">#{name}</a>"
  else
    "#{name}"
  end
end
# Renders a search tip: a link that runs +query+, followed by its +legend+.
#
# query  - String query; URL-encoded for the link target and shown as-is as
#          the link text.
# legend - String description, inserted into the HTML as-is.
#
# Returns the HTML String of the tip paragraph.
def link_to_tip(query, legend)
  # Queries contain spaces, "^", "[" and "]", which must not appear raw in a URL.
  href = "/?q=#{ERB::Util.url_encode(query)}"
  %Q|<p class="tip"><a href="#{href}">#{query}</a><span>#{legend}</span></p>|
end
end
# Search page. Request parameters:
#
# q - query string; a blank query matches all messages
# s - "date" sorts the results by date (descending)
# p - zero-based page number
#
get '/' do
  q = params[:q].to_s !~ /\S/ ? '*' : params[:q].to_s
  s = params[:s] == 'date'
  # Clamp the page number so that a negative `p` cannot yield a negative offset.
  page = [params[:p].to_i, 0].max
  f = page * settings.per_page

  @s = Tire.search( ENV['INDEX'] ) do |search|
    search.query { |query| query.string q }
    # Return whole fields (not fragments) with the matches wrapped in a tag.
    search.highlight :subject => {:number_of_fragments => 0},
                     :body => {:number_of_fragments => 0},
                     :options => { :tag => '<em class="highlight">' }
    search.sort { date :desc } if s
    search.size settings.per_page
    search.from f
  end

  # puts @s.to_curl
  erb :results
end
<!DOCTYPE html>
<html>
<head>
<title>Search your GMail (<%= ENV['INDEX'] %>)</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<script src="http://code.jquery.com/jquery-1.6.1.min.js"></script>
<style>
/* Page basics */
body {
  color: #222;
  background: #fff;
  font-size: 76%;
  font-family: Helvetica, sans-serif;
  padding: 2em 6em;
}
a { color: #2f3b4c; text-decoration: none !important; }

/* Page title, floated left inside the search form area */
h1 {
  color: #999;
  font-size: 120%;
  padding: 0.5em 0.8em 0 0;
  margin: 0;
  float: left;
  position: relative;
}
h1 a { color: #999; }

/* Search form and sort tools */
#search-form {
  border-bottom: 2px solid #ccc;
  padding: 0.5em 0 0.5em 0;
  clear: both;
}
#search-form input[type='text'] {
  color: #222;
  font-size: 110%;
  padding: 0.25em;
  width: 50em;
}
#search-form #tools {
  color: #34383e;
  margin: 0 0 0 11.6em;
}
#search-form #tools a {
  color: #2f3b4c;
  text-decoration: underline !important;
}
#search-form #tools .dim { color: #878787; }

/* "Toggle tips" link under the title. The former `top: 0 bottom: 0`
   declaration was malformed (missing semicolon) and ignored by browsers;
   it is dropped so that the link keeps its rendered position. */
#toggle-tips {
  font-size: 10px;
  font-weight: normal;
  text-decoration: underline !important;
  position: absolute;
}

/* Query tips box */
#search-form #tips {
  background-color: #eff0f1;
  padding: 1em 2em;
  margin: 0 0 0 11.6em;
  position: relative;
  -moz-border-radius: 0.5em;
  -webkit-border-radius: 0.5em;
  border-radius: 0.5em;
}
#search-form #tips p {
  padding: 0.5em 0 0.5em 0;
  margin: 0;
}
#search-form #tips a {
  background: #B9D4FA;
  padding: 0.25em 0.5em 0.1em 0.5em;
  -moz-border-radius: 0.25em;
  -webkit-border-radius: 0.25em;
  border-radius: 0.25em;
}
#search-form #tips a:hover {
  color: #dde4ed;
  background: #444e5d;
}
#search-form #tips span {
  color: #878787;
  font-size: 95%;
  margin-left: 1em;
}

/* Search results */
.message {
  line-height: 125%;
  padding: 1em 0;
  border-bottom: 1px solid #ccc;
  position: relative;
}
.message p { margin: 0 0 0.5em 0; }
.message .from {
  color: #34383e;
  font-weight: bold;
  float: left;
}
.message .from small {
  color: #5f646b;
  font-weight: normal;
}
.message .date {
  color: #5976a1;
  float: right;
}
.message .subject {
  color: #34383e;
  clear: both;
}

/* The body is clipped to a single line until the message is expanded */
.message .body {
  color: #87858f;
  font-size: 95%;
  height: 1.25em;
  overflow: hidden;
}
.message .body p { display: inline; }
.message.expanded .body { height: auto; }
.message.expanded .body p { display: block; }
.message:hover { background: #f5f5f8; }

/* Search-term highlight. Highlights are <em> elements; `font-size: normal`
   was not a valid declaration -- presumably `font-style: normal` was meant. */
.highlight {
  font-style: normal;
  background-color: #fef4c1;
  padding: 0.25em 0.25em;
  -moz-border-radius: 0.25em;
  -webkit-border-radius: 0.25em;
  border-radius: 0.25em;
}
</style>
<script>
// Once the DOM is ready: hide the tips box, make message bodies expandable
// on click, and wire up the "Toggle tips" link.
$(function () {
  // Show a pointer cursor over a message body.
  function showPointer() {
    $(this).css({ cursor : 'pointer' });
  }

  // Expand or collapse the message that owns the clicked body.
  function toggleMessage() {
    $(this).parent().toggleClass('expanded');
    return false;
  }

  // Show or hide the tips box.
  function toggleTips() {
    $('#tips').toggle('fast');
    return false;
  }

  $('#tips').hide();
  $('.message .body').hover(showPointer).click(toggleMessage);
  $('#toggle-tips').click(toggleTips);
});
</script>
</head>
<body>
<%# Request parameters are user input: escape them before echoing them into HTML or URLs. %>
<% query_html = ERB::Util.html_escape(params[:q]) %>
<% sort_html  = ERB::Util.html_escape(params[:s]) %>
<% query_url  = ERB::Util.url_encode(params[:q]) %>
<% sort_url   = ERB::Util.url_encode(params[:s]) %>
<% page       = [params[:p].to_i, 0].max %>
<div id="search-form">
  <h1>
    <a href="/">Search your GMail</a><br>
    <a id="toggle-tips" href="#">Toggle tips</a>
  </h1>
  <form action="/" method="get" accept-charset="utf-8">
    <input type="hidden" name="s" value="<%= sort_html %>">
    <input type="text" name="q" value="<%= query_html %>">
    <input type="submit" value="Search">
  </form>
  <div id="tools">
    <p>
      <span class="dim">Sort by:</span>
      <%= link_to_unless params[:s] =~ /\S/, 'relevance', "/?q=#{query_url}" %> <span class="dim">or</span>
      <%= link_to_unless params[:s] !~ /\S/, 'date', "/?q=#{query_url}&amp;s=date" %>
      <span class="dim">. Showing <%= @s.results.size %> of <%= @s.results.total %> total results.</span>
    </p>
  </div>
  <div id="tips">
    <%= link_to_tip('git*', 'Messages beginning with “git”') %>
    <%= link_to_tip('from:github.com', 'Messages from Github') %>
    <%= link_to_tip('apple OR linux^100', 'Messages about Apple or Linux, with a boost for Linux') %>
    <%= link_to_tip("date:[#{(Time.now-7*24*60*60).strftime('%Y-%m-%d')} TO #{Time.now.strftime('%Y-%m-%d')}]", 'Messages from last week') %>
  </div>
</div>
<% @s.results.each do |m| %>
<div class="message">
  <p class="from">
    <%# Senders look like "Name <address>", so they must be escaped to show up at all. %>
    <%= ERB::Util.html_escape(Array(m.from).join(', ')) %>
    <% if m._score && m._score != 1.0 %>
    <small title="score"><%= m._score.inspect %></small>
    <% end %>
  </p>
  <%# Messages whose date could not be parsed on import have no date. %>
  <p class="date"><%= m.date ? Time.parse(m.date).strftime('%Y/%m/%d %H:%M') : '' %></p>
  <% body = (m.highlight && m.highlight.body) ? m.highlight.body.first : m.body %>
  <% subject = (m.highlight && m.highlight.subject) ? m.highlight.subject.first : m.subject %>
  <%# NOTE(review): subject and body are output unescaped so that the highlight markup survives; message content is therefore rendered as raw HTML. %>
  <p class="subject"><%= subject %></p>
  <div class="body"><%= simple_format(body.to_s) %></div>
</div>
<% end %>
<% if @s.results.total > (page + 1) * settings.per_page %>
<p><a href="/?q=<%= query_url %>&amp;s=<%= sort_url %>&amp;p=<%= page + 1 %>">Next &raquo;</a></p>
<% end %>
<% if @s.results.empty? %>
<p>No results.</p>
<% end %>
</body>
</html>
@vijaydev
Copy link

I'm getting an error while running the import script: # Errno::ECONNREFUSED: Connection refused - connect(2)

@karmi
Copy link
Author

karmi commented May 15, 2011

Which line, details? Either Gmail or ES is inaccessible then.

@vijaydev
Copy link

I can't see any line numbers in the output. I've not yet tinkered with the script. These are the first few lines:

Connecting to Gmail account 'vijaydev.cse@gmail.com'...
Importing 1337 messages, press Ctrl-C to abort...
[!] Gmail is different. Here's what you need to know. mail-noreply@google.com
#<Errno::ECONNREFUSED: Connection refused - connect(2)>
[!] It's easy to switch to Gmail! mail-noreply@google.com
#<Errno::ECONNREFUSED: Connection refused - connect(2)>

So we can see, it did connect to Gmail (the script continues to list subjects of my messages.. )

@karmi
Copy link
Author

karmi commented May 15, 2011

And ES is running on http://localhost:9200 ?

@vijaydev
Copy link

facepalm! That wasnt running and I thought it was! Sorry! Indexing is on :)

@karmi
Copy link
Author

karmi commented May 15, 2011

Great!

@josegonzalez
Copy link

I get the following lovely error:

--------------------------------------------------------------------------------
Connecting to Gmail account 'mail@redact.ed'...
--------------------------------------------------------------------------------
/Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/net/imap.rb:1101:in `get_tagged_response': Unknown command f1if658670vbm.74 (Net::IMAP::BadResponseError)
from /Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/net/imap.rb:1153:in `block in send_command'
from /Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/monitor.rb:201:in `mon_synchronize'
from /Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/net/imap.rb:1135:in `send_command'
from /Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/net/imap.rb:437:in `block in select'
from /Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/monitor.rb:201:in `mon_synchronize'
from /Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/net/imap.rb:435:in `select'
from /Users/jose/.rvm/gems/ruby-1.9.2-p180/gems/gmail-0.4.0/lib/gmail/client/base.rb:199:in `switch_to_mailbox'
from /Users/jose/.rvm/gems/ruby-1.9.2-p180/gems/gmail-0.4.0/lib/gmail/client/base.rb:157:in `block in mailbox'
from <internal:prelude>:10:in `synchronize'
from /Users/jose/.rvm/gems/ruby-1.9.2-p180/gems/gmail-0.4.0/lib/gmail/client/base.rb:154:in `mailbox'
from /Users/jose/.rvm/gems/ruby-1.9.2-p180/gems/gmail-0.4.0/lib/gmail/client/base.rb:177:in `inbox'
from gmail-import.rb:137:in `block (2 levels) in <main>'
from /Users/jose/.rvm/gems/ruby-1.9.2-p180/gems/gmail-0.4.0/lib/gmail.rb:75:in `perform_block'
from /Users/jose/.rvm/gems/ruby-1.9.2-p180/gems/gmail-0.4.0/lib/gmail.rb:50:in `new'
from gmail-import.rb:135:in `block in <main>'
from /Users/jose/.rvm/rubies/ruby-1.9.2-p180/lib/ruby/1.9.1/benchmark.rb:309:in `realtime'
from gmail-import.rb:123:in `<main>'

Any ideas?

@karmi
Copy link
Author

karmi commented May 20, 2011 via email

@josegonzalez
Copy link

Oddly enough, I was getting lovely 1.8.7 errors for "gem install bundler" (new macbook). Maybe it's an rvm issue?

@karmi
Copy link
Author

karmi commented May 21, 2011

@josegonzalez: Definitely weird, I'm on 1.8.7 as well.

@ericTsiliacos
Copy link

awesome stuff! However, after running $ INDEX=user@gmail.com ruby gmail-server.rb with my username I got the following output:

--------------------------------------------------------------------------------
Connecting to Gmail account 'myemail@gmail.com'...
--------------------------------------------------------------------------------
Importing 901 messages, press Ctrl-C to abort...
--------------------------------------------------------------------------------
[!]  Get started with Gmail <mail-noreply@google.com>
     #<ArgumentError: Please pass a JSON string or object with a 'to_indexed_json' method,'Symbol' given.>
[!]  Access Gmail on your mobile phone <mail-noreply@google.com>
     #<ArgumentError: Please pass a JSON string or object with a 'to_indexed_json' method,'Symbol' given.>
[!]  Import your contacts and old email <mail-noreply@google.com>
     #<ArgumentError: Please pass a JSON string or object with a 'to_indexed_json' method,'Symbol' given.>
...

I can't seem to figure out what the problem is. Any ideas? thank you!

@karmi
Copy link
Author

karmi commented Jun 2, 2012

Did you run the ruby gmail-import.rb user@gmail.com yourpassword script first?

@ericTsiliacos
Copy link

Yah, I ran that first. It seems to be connecting to gmail, but isn't reading them correctly. It even seems to be reading the subject line of each email but then throws an error when it tries to parse the email I think.

And then after I abort I get the following message even after leaving it running for awhile where the number of errors is the total number of emails it attempted to read before I aborted. It is able to find the subject line of an email but then throws the above mentioned error.

Imported 0 messages into index: http://localhost:9200/my_email/_search?q=*
in 0 hours. There were 8 errors.

@ericTsiliacos
Copy link

I found the error. In gmail-import.rb, line 158 was throwing the error. The above code has

index.store :message, document

The following fixed it for me:

index.store :message => document

@masukav
Copy link

masukav commented Apr 11, 2014

Returns TypeError at /
no implicit conversion of nil into String for time.rb _parse.

d = Date._parse(date, comp)

How can this be avoided?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment