Skip to content

Instantly share code, notes, and snippets.

@karmi
Created May 15, 2011 11:15
Show Gist options
  • Save karmi/973059 to your computer and use it in GitHub Desktop.
Save karmi/973059 to your computer and use it in GitHub Desktop.
Import your Gmail messages into ElasticSearch and search them with a simple web application.
.DS_Store
*.log
Gemfile.lock
# Gems required by the import script and the web application.
# The gem index is fetched over HTTPS so that downloads cannot be tampered with in transit.
source 'https://rubygems.org'
gem 'tire'
gem 'mime'
gem 'gmail'
gem 'sinatra'
# =======================================================
# Importing Gmail messages into ElasticSearch
# =======================================================
#
# Import your Gmail messages into ElasticSearch and search them with a simple web application.
#
# Requirements:
# -------------
#
# * ElasticSearch 0.16.x
# * Ruby 1.8.x
# * Rubygems
# * Bundler gem
#
# Usage:
# ------
#
# Install the required gems:
#
# $ bundle install
#
# Run this script to import your e-mail into ElasticSearch:
#
# $ ruby gmail-import.rb user@gmail.com yourpassword
#
# Note that messages are fetched one by one, so the speed of the import depends on your connection.
# You may abort the process at any time and search the messages already stored.
#
# Then launch the web application:
#
# $ INDEX=user@gmail.com ruby gmail-server.rb
#
# Open <http://localhost:4567/> in your browser.
#
#
require 'rubygems'
require 'time'
require 'iconv'
require 'tire'
require 'mime'
require 'gmail'
# Write progress output immediately instead of buffering it
#
STDOUT.sync = true

# Gmail credentials are taken from the first two command line arguments
#
USERNAME = ARGV[0]
PASSWORD = ARGV[1]

if USERNAME.nil? || PASSWORD.nil?
  usage = ["[ERROR] Please provide your Gmail credentials:",
           "",
           " #{__FILE__} username@gmail.com password",
           ""]
  puts usage
  exit(1)
end

# Helper variables: number of stored messages, number of messages in the inbox,
# and the list of messages which could not be stored
#
@done, @total, @errors = 0, 0, []
# Helper method to display elapsed time
#
# elapsed - duration in seconds (Integer or Float); nil is treated as zero.
#
# Returns a String:
#
# * below one minute:       "12.50000 seconds"
# * below one hour:         "3 minutes and 20 seconds"
# * up to (and incl.) a day: "1 hours and 5 minutes"
# * more than a day:        "30 hours"
#
def elapsed_to_human(elapsed)
  minute = 60
  hour   = 60 * minute
  day    = 24 * hour

  seconds = elapsed.to_f
  whole   = seconds.to_i

  # Plain comparisons (instead of integer ranges) so that fractional values
  # such as 59.5 or 3599.5 cannot fall between two branches.
  if seconds < minute
    "#{sprintf("%1.5f", seconds)} seconds"
  elsif seconds < hour
    "#{whole / minute} minutes and #{whole % minute} seconds"
  elsif seconds <= day
    # The remainder of the hour is in seconds; convert it to whole minutes.
    "#{whole / hour} hours and #{(whole % hour) / minute} minutes"
  else
    "#{whole / hour} hours"
  end
end
# Display import statistics
#
# Builds the summary text from @done, USERNAME, @elapsed and @errors:
# an empty line, the number of imported messages with the search URL of the index,
# the elapsed time with the number of errors, and a final line break.
#
# Returns a String.
#
def report
  search_url = "<http://localhost:9200/#{USERNAME}/_search?q=*> "
  imported   = "Imported #{@done} messages into index: #{search_url}"
  outcome    = "in #{elapsed_to_human(@elapsed)}. There were #{@errors.size} errors."

  "\n#{imported}\n#{outcome}\n"
end
# Clean exit on interrupt
#
# On Ctrl-C, print the statistics gathered so far and exit with status 1
# when at least one message failed to import, with status 0 otherwise.
#
Signal.trap('INT') do
  puts "\r\nExiting...\n"
  puts report
  exit(@errors.empty? ? 0 : 1)
end
# Set up ElasticSearch index with the same name as your account
#
# Field mapping for the `message` document type. `from` is a multi-field:
# an analyzed `from` variant plus a not-analyzed, stored `exact` variant.
#
message_properties = {
  :id      => { :type => 'string', :index => 'not_analyzed', :store => true },
  :subject => { :type => 'string', :analyzer => 'snowball', :boost => 10 },
  :from    => {
    :type   => 'multi_field',
    :fields => {
      :from  => { :type => 'string', :analyzer => 'snowball', :boost => 100 },
      :exact => { :type => 'string', :index => 'not_analyzed', :store => true }
    }
  },
  :to      => { :type => 'string', :analyzer => 'keyword' },
  :date    => { :type => 'date' },
  :body    => { :type => 'string', :analyzer => 'snowball' }
}

index = Tire.index(USERNAME) do
  # Remove the index first when the FORCE environment variable is set
  #
  delete if ENV['FORCE']

  # Create the index for messages with proper mapping
  #
  create :mappings => { :message => { :properties => message_properties } }
end
# Helper method to strip non-UTF-8 characters
#
# A space is appended before the conversion and cut off again afterwards
# (the `[0..-2]` slice).
#
def force_utf(s)
  Iconv.conv('UTF-8//IGNORE', 'UTF-8', s + ' ')[0..-2]
end

# The import is timed by hand and @elapsed is refreshed after every message,
# so that the Ctrl-C handler can report a meaningful duration for an aborted run.
#
started_at = Time.now
@elapsed   = 0.0

puts '-'*80, "Connecting to Gmail account '#{USERNAME}'...", '-'*80

# Connect to Gmail account
#
Gmail.new(USERNAME, PASSWORD) do |gmail|
  @total = gmail.inbox.count
  puts "Importing #{@total} messages, press Ctrl-C to abort...", '-'*80

  # Process inbox messages one by one
  #
  gmail.inbox.emails.each do |email|
    # Defensively define message properties (clean IDs, force UTF, etc)
    #
    document = {}
    document[:id]      = email.message_id.to_s.tr('<>', '').tr('/', '-')
    document[:subject] = force_utf(email.subject.to_s)
    document[:from]    = Array(email.from).map { |a| "#{a.name} <#{a.mailbox}@#{a.host}>" }
    document[:to]      = Array(email.to).map   { |a| "#{a.name} <#{a.mailbox}@#{a.host}>" }
    # The date is left empty (nil) when it cannot be parsed
    document[:date]    = (Time.parse(email.date).strftime('%Y-%m-%dT%H:%M:%S%z') rescue nil)
    # Use the first MIME part when there is one, the whole body otherwise
    document[:body]    = force_utf( (email.body.parts.first.body.to_s rescue email.body.to_s) )

    begin
      # Store the message in the index
      #
      # NOTE(review): a user reported having to change this call with a different
      # Tire version -- verify the `store` signature against the installed gem.
      index.store :message, document
      @done += 1
      puts "\e[32m#{@done.to_s.ljust(4)}\e[0m #{email.subject} <#{email.from_addrs.join(', ')}>"
    rescue StandardError => e
      # Display failure message
      #
      # Only StandardError is rescued: the SystemExit raised by `exit` in the
      # Ctrl-C handler must pass through, otherwise aborting while a message
      # is being stored would be counted as an error and the import would go on.
      puts "\e[31m[!]\e[0m #{email.subject} <#{email.from_addrs.join(', ')}>"
      puts " #{e.inspect}"
      @errors << email
    end

    @elapsed = Time.now - started_at
  end
end

@elapsed = Time.now - started_at
puts report
# =======================================================
# Simple web application to search your Gmail messages
# =======================================================
#
# Usage:
# ------
#
# First, import your messages with the `gmail-import.rb` script.
#
# Then, launch this application:
#
# $ INDEX=user@gmail.com ruby gmail-server.rb
#
#
require 'rubygems'
require 'erb'
require 'tire'
require 'sinatra'
# The name of the index to search is passed in the INDEX environment variable
#
if ENV['INDEX'].nil?
  usage = ["[ERROR] Please set the index name with the INDEX environment variable:",
           "",
           " $ INDEX=user@gmail.com ruby #{__FILE__}",
           ""]
  puts usage
  exit(1)
end

# Templates are looked up next to this file; result pages hold 25 messages
#
configure do
  set :views,    File.dirname(__FILE__)
  set :per_page, 25
end
helpers do
# Converts plain text into simple HTML paragraphs.
#
# text - the String to format; nil is treated as an empty string.
#        The argument itself is not modified.
#
# Line endings are normalized to "\n", and every run of two or more newlines
# becomes a paragraph break. Single newlines are kept as they are.
#
# Returns a new String wrapped in <p>...</p>.
#
def simple_format(text)
  normalized = text.to_s.gsub(/\r\n?/, "\n")               # \r\n and \r -> \n
  "<p>" + normalized.gsub(/\n{2,}/, "</p>\n<p>") + "</p>"  # 2+ newlines -> paragraph
end
# Renders +name+ as a link to +url+ when +condition+ is truthy,
# and as plain text otherwise.
#
# NOTE: despite the `unless` in its name, the link is produced when the
# condition holds; the template relies on this behaviour.
#
def link_to_unless(condition, name, url)
  if condition
    "<a href=\"#{url}\">#{name}</a>"
  else
    "#{name}"
  end
end
# Renders a search tip: a link which runs +query+, followed by its +legend+.
#
# query  - the query string; URL-encoded in the link target and HTML-escaped
#          in the link text, so that spaces, brackets, quotes etc. are safe.
# legend - the description displayed next to the link (inserted as is).
#
# Returns a String with the HTML of the tip.
#
def link_to_tip(query, legend)
  href  = "/?q=#{ERB::Util.url_encode(query)}"
  label = ERB::Util.html_escape(query)
  %Q|<p class="tip"><a href="#{href}">#{label}</a><span>#{legend}</span></p>|
end
end
# Search page
#
# Parameters:
#
# * q - the query string; a blank or missing value searches for everything (`*`)
# * s - when set to `date`, results are sorted by date, newest first
# * p - zero-based page number
#
# Stores the search in @s and renders the `results` template.
#
get '/' do
  terms   = params[:q].to_s !~ /\S/ ? '*' : params[:q].to_s
  by_date = params[:s] == 'date'

  # A negative page number would result in a negative offset; use the first page instead
  page    = [params[:p].to_i, 0].max
  offset  = page * settings.per_page

  @s = Tire.search( ENV['INDEX'] ) do |search|
    search.query { |query| query.string terms }
    search.highlight :subject => {:number_of_fragments => 0},
                     :body    => {:number_of_fragments => 0},
                     :options => { :tag => '<em class="highlight">' }
    search.sort { date :desc } if by_date
    search.size settings.per_page
    search.from offset
  end

  # puts @s.to_curl
  erb :results
end
<!DOCTYPE html>
<html>
<head>
<title>Search your Gmail (<%= ENV['INDEX'] %>)</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<script src="http://code.jquery.com/jquery-1.6.1.min.js"></script>
<style>
body
{ color: #222; background: #fff;
font-size: 76%;
font-family: Helvetica, sans-serif;
padding: 2em 6em; }
a { color: #2f3b4c; text-decoration: none !important; }
h1
{ color: #999;
font-size: 120%;
padding: 0.5em 0.8em 0 0;
margin: 0;
float: left;
position: relative; }
h1 a { color: #999; }
#search-form
{ border-bottom: 2px solid #ccc;
padding: 0.5em 0 0.5em 0;
clear: both; }
#search-form input[type='text']
{ color: #222;
font-size: 110%;
padding: 0.25em;
width: 50em; }
#search-form #tools
{ color: #34383e;
margin: 0 0 0 11.6em; }
#search-form #tools a
{ color: #2f3b4c; text-decoration: underline !important; }
#search-form #tools .dim
{ color: #878787; }
/* "Toggle tips" link below the page title.
   The former `top: 0 bottom: 0;` line was not a valid declaration and was
   discarded by browsers; it is removed, which leaves the rendering unchanged. */
#toggle-tips
{ font-size: 10px;
font-weight: normal;
text-decoration: underline !important;
position: absolute; }
#search-form #tips
{ background-color: #eff0f1;
padding: 1em 2em;
margin: 0 0 0 11.6em;
position: relative;
-moz-border-radius: 0.5em;
-webkit-border-radius: 0.5em;
border-radius: 0.5em; }
#search-form #tips p
{ padding: 0.5em 0 0.5em 0;
margin: 0; }
#search-form #tips a
{ background: #B9D4FA;
padding: 0.25em 0.5em 0.1em 0.5em;
-moz-border-radius: 0.25em;
-webkit-border-radius: 0.25em;
border-radius: 0.25em; }
#search-form #tips a:hover
{ color: #dde4ed;
background: #444e5d; }
#search-form #tips span
{ color: #878787;
font-size: 95%;
margin-left: 1em; }
.message
{ line-height: 125%;
padding: 1em 0;
border-bottom: 1px solid #ccc;
position: relative; }
.message p
{ margin: 0 0 0.5em 0; }
.message .from
{ color: #34383e;
font-weight: bold;
float: left; }
.message .from small
{ color: #5f646b;
font-weight: normal; }
.message .date
{ color: #5976a1;
float: right; }
.message .subject
{ color: #34383e;
clear: both; }
.message .body
{ color: #87858f;
font-size: 95%;
height: 1.25em;
overflow: hidden; }
.message .body p
{ display: inline; }
.message.expanded .body
{ height: auto; }
.message.expanded .body p
{ display: block; }
.message:hover
{ background: #f5f5f8; }
/* Highlighted search terms (applied to <em> elements).
   `font-size: normal` is not a valid value and was ignored by browsers;
   presumably `font-style: normal` was meant, to undo the italics of <em>. */
.highlight {
font-style: normal;
background-color: #fef4c1;
padding: 0.25em 0.25em;
-moz-border-radius: 0.25em;
-webkit-border-radius: 0.25em;
border-radius: 0.25em;
}
</style>
<script>
// Page behaviour, set up once the document is ready:
// the tips are hidden initially, a click on a message body expands or collapses
// the message, and the "Toggle tips" link shows or hides the tips.
$(document).ready(function () {
  var tips = $('#tips');
  tips.hide();

  $('.message .body')
    .hover(function () { $(this).css('cursor', 'pointer'); })
    .click(function () {
      $(this).parent().toggleClass('expanded');
      return false;
    });

  $('#toggle-tips').click(function () {
    tips.toggle('fast');
    return false;
  });
});
</script>
</head>
<body>
  <%# Search form. Request parameters are HTML-escaped inside attributes and
      URL-encoded inside links, so that queries with quotes, spaces, `&` etc. work. %>
  <div id="search-form">
    <h1>
      <a href="/">Search your Gmail</a><br>
      <a id="toggle-tips" href="#">Toggle tips</a>
    </h1>
    <form action="/" method="get" accept-charset="utf-8">
      <input type="hidden" name="s" value="<%= ERB::Util.html_escape(params[:s]) %>">
      <input type="text" name="q" value="<%= ERB::Util.html_escape(params[:q]) %>">
      <input type="submit" value="Search">
    </form>
    <div id="tools">
      <p>
        <span class="dim">Sort by:</span>
        <%= link_to_unless params[:s] =~ /\S/, 'relevance', "/?q=#{ERB::Util.url_encode(params[:q])}" %> <span class="dim">or</span>
        <%= link_to_unless params[:s] !~ /\S/, 'date', "/?q=#{ERB::Util.url_encode(params[:q])}&amp;s=date" %>
        <span class="dim">. Showing <%= @s.results.size %> of <%= @s.results.total %> total results.</span>
      </p>
    </div>
    <div id="tips">
      <%= link_to_tip('git*', 'Messages beginning with “git”') %>
      <%= link_to_tip('from:github.com', 'Messages from Github') %>
      <%= link_to_tip('apple OR linux^100', 'Messages about Apple or Linux, with a boost for Linux') %>
      <%= link_to_tip("date:[#{(Time.now-7*24*60*60).strftime('%Y-%m-%d')} TO #{Time.now.strftime('%Y-%m-%d')}]", 'Messages from last week') %>
    </div>
  </div>
  <% @s.results.each do |m| %>
    <div class="message">
      <p class="from">
        <%# Senders look like `Name <address>`; escaped so the address is not parsed as a tag %>
        <%= ERB::Util.html_escape(Array(m.from).join(', ')) %>
        <% if m._score && m._score != 1.0 %>
          <small title="score"><%= m._score.inspect %></small>
        <% end %>
      </p>
      <%# Messages imported without a parseable date have no date; show nothing instead of failing %>
      <p class="date"><%= m.date ? Time.parse(m.date).strftime('%Y/%m/%d %H:%M') : '' %></p>
      <%# Highlighted fragments carry <em class="highlight"> markup, so subject and body are not escaped.
          NOTE(review): this emits message content as raw HTML -- consider sanitizing it. %>
      <% body = (m.highlight && m.highlight.body) ? m.highlight.body.first : m.body %>
      <% subject = (m.highlight && m.highlight.subject) ? m.highlight.subject.first : m.subject %>
      <p class="subject"><%= subject %></p>
      <div class="body"><%= simple_format(body) %></div>
    </div>
  <% end %>
  <% if @s.results.total > (params[:p].to_i+1)*settings.per_page %>
    <p><a href="/?q=<%= ERB::Util.url_encode(params[:q]) %>&amp;s=<%= ERB::Util.url_encode(params[:s]) %>&amp;p=<%= params[:p].to_i+1 %>">Next &raquo;</a></p>
  <% end %>
  <% if @s.results.empty? %>
    <p>No results.</p>
  <% end %>
</body>
</html>
@karmi
Copy link
Author

karmi commented Jun 2, 2012

Did you run the ruby gmail-import.rb user@gmail.com yourpassword script first?

@ericTsiliacos
Copy link

Yah, I ran that first. It seems to be connecting to gmail, but isn't reading them correctly. It even seems to be reading the subject line of each email but then throws an error when it tries to parse the email I think.

And then after I abort I get the following message even after leaving it running for awhile where the number of errors is the total number of emails it attempted to read before I aborted. It is able to find the subject line of an email but then throws the above mentioned error.

Imported 0 messages into index: http://localhost:9200/my_email/_search?q=*
in 0 hours. There were 8 errors.

@ericTsiliacos
Copy link

I found the error. In gmail-import.rb, line 158 was throwing the error. The above code has

index.store :message, document

The following fixed it for me:

index.store :message => document

@masukav
Copy link

masukav commented Apr 11, 2014

Returns TypeError at /
no implicit conversion of nil into String for time.rb _parse.

d = Date._parse(date, comp)

How can this be avoided?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment