@fabianem
Last active May 15, 2024 12:03
Google Apps Script to count emails by sender in batches
// Google Apps Script for counting emails by sender in batches. Stops before the script runtime limit is reached and resumes automatically via a time-based trigger.
// 1. Allow the required permissions
// 2. Add the Gmail advanced service to the project
// 3. Fill in <your-email> in the `runProcessor` function and adjust the query if needed
// 4. Run the function `runProcessor`
// 5. The result will be saved in a new spreadsheet
// Based on ideas from https://stackoverflow.com/a/59222719/5162536 and
// https://medium.com/geekculture/bypassing-the-maximum-script-runtime-in-google-apps-script-e510aa9ae6da
const execution_threshold = 4 * 60 * 1000 // 4 minutes in ms
const email_batch = 200 // number of emails to list per Gmail API call
const progress_filename = 'emails-by-sender-processor.json'
const progress_fileid_property = 'emails-by-sender-processor-file-id'
const progress_status_property = 'emails-by-sender-processor-status'
const trigger_id_property = 'emails-by-sender-trigger-id'
class SenderList {
  constructor(query, email) {
    if (SenderList.instance) return SenderList.instance;
    this.mailsBySender = new Map();
    this._totalMessages = 0;
    this._query = query;
    this._pageToken = null;
    this._email = email;
    SenderList.instance = this;
    return SenderList.instance;
  }
  get query() {
    return this._query;
  }
  addSender(sender) {
    if (!this.mailsBySender) {
      this.mailsBySender = new Map();
    }
    if (!this.mailsBySender.get(sender)) {
      this.mailsBySender.set(sender, 1)
    } else {
      this.mailsBySender.set(sender, this.mailsBySender.get(sender) + 1)
    }
    return this;
  }
  getSenders() {
    return this.mailsBySender;
  }
  addTotalMessages(count) {
    if (!this._totalMessages) {
      this._totalMessages = 0;
    }
    this._totalMessages += count;
    return this;
  }
  set totalMessages(value) {
    this._totalMessages = value;
  }
  get totalMessages() {
    return this._totalMessages;
  }
  set pageToken(token) {
    this._pageToken = token;
  }
  get pageToken() {
    return this._pageToken;
  }
  get email() {
    return this._email;
  }
  toJSON() {
    return {
      mailsBySender: JSON.stringify([...this.mailsBySender.entries()]),
      totalMessages: this.totalMessages,
      query: this.query,
      pageToken: this.pageToken,
      email: this.email,
    };
  }
  import(json) {
    this.mailsBySender = new Map(JSON.parse(json.mailsBySender));
    this._totalMessages = json.totalMessages;
    this._query = json.query;
    this._pageToken = json.pageToken;
    this._email = json.email;
    return this;
  }
}
class Timer {
  start() {
    this._start = Date.now();
  }
  getDuration() {
    return Date.now() - this._start;
  }
}
class ProcessorStatus {
  static set(status) {
    PropertiesService.getScriptProperties().setProperty(progress_status_property, status);
  }
  static get() {
    return PropertiesService.getScriptProperties().getProperty(progress_status_property);
  }
}
class Trigger {
  constructor(functionName, minutesTick) {
    let trigger = ScriptApp.newTrigger(functionName).timeBased().everyMinutes(minutesTick).create();
    PropertiesService.getScriptProperties().setProperty(trigger_id_property, trigger.getUniqueId());
    return trigger;
  }
  static isAlreadyCreated() {
    let triggerId = PropertiesService.getScriptProperties().getProperty(trigger_id_property);
    if (triggerId) {
      let triggers = ScriptApp.getProjectTriggers();
      let existingTrigger = triggers.find((trigger) => trigger.getUniqueId() === triggerId && trigger.getHandlerFunction() === 'runProcessor');
      if (existingTrigger) return true;
    }
    return false;
  }
  static deleteTrigger(e) {
    if (typeof e !== 'object') return Logger.log(`${e} is not an event object`);
    if (!e.triggerUid) return Logger.log(`${JSON.stringify(e)} doesn't have a triggerUid`);
    ScriptApp.getProjectTriggers().forEach(trigger => {
      if (trigger.getUniqueId() === e.triggerUid) return ScriptApp.deleteTrigger(trigger);
    });
  }
}
function runProcessor(e) {
  try {
    var timer = new Timer();
    timer.start();
    if (ProcessorStatus.get() === 'running') return Logger.log("exiting because processor already running");
    ProcessorStatus.set('running')
    let query = "in: inbox"; // query for filtering emails
    let email = '<your-email>' // your email here
    var senderList = new SenderList(query, email);
    const existingProgressFileId = PropertiesService.getScriptProperties().getProperty(progress_fileid_property);
    if (existingProgressFileId) {
      Logger.log('resuming processing...');
      let json = readProgressFile(existingProgressFileId);
      if (json) {
        senderList = senderList.import(json);
      } else {
        Logger.log(`progress file with id ${existingProgressFileId} does not exist, starting processing from the beginning...`)
        PropertiesService.getScriptProperties().deleteProperty(progress_fileid_property);
      }
    }
    Logger.log(`starting processing sender list for query '${senderList.query}' and email '${senderList.email}'. Current total messages processed: ${senderList.totalMessages}`)
    return processSenderList(senderList, timer, e);
  } catch (e) {
    Logger.log(`error occurred while processing, setting processor status to failed: ${e}`);
    return ProcessorStatus.set('failed');
  }
}
function processSenderList(senderList, timer, e) {
  do {
    if (timer.getDuration() >= execution_threshold) {
      Logger.log('stopping because execution threshold reached, saving progress and creating trigger...');
      writeProgressFile(senderList);
      ProcessorStatus.set('not running');
      if (!Trigger.isAlreadyCreated()) new Trigger('runProcessor', 1);
      return;
    }
    var result = Gmail.Users.Messages.list(senderList.email, { maxResults: email_batch, pageToken: senderList.pageToken, q: senderList.query });
    if (result.messages) {
      for (var i = 0; i < result.messages.length; i++) {
        var sender = GmailApp.getMessageById(result.messages[i].id).getFrom();
        senderList.addSender(sender);
        senderList.addTotalMessages(1);
      }
      Logger.log('current total messages: ' + senderList.totalMessages)
    }
    senderList.pageToken = result.nextPageToken
  } while (senderList.pageToken);
  Logger.log('finished, got total messages: ' + senderList.totalMessages)
  writeSenderList(senderList);
  cleanUp(e);
}
function cleanUp(e) {
  Logger.log('cleaning up...')
  Trigger.deleteTrigger(e);
  const existingProgressFileId = PropertiesService.getScriptProperties().getProperty(progress_fileid_property);
  if (existingProgressFileId) {
    let file = DriveApp.getFileById(existingProgressFileId);
    if (file.getName() === progress_filename) {
      file.setTrashed(true);
    } else {
      Logger.log(`not deleting progress file because of unexpected file name: '${file.getName()}'`)
    }
  }
  PropertiesService.getScriptProperties().deleteProperty(progress_fileid_property);
  PropertiesService.getScriptProperties().deleteProperty(progress_status_property);
  PropertiesService.getScriptProperties().deleteProperty(trigger_id_property);
}
function writeProgressFile(senderList) {
  let file;
  const existingFileId = PropertiesService.getScriptProperties().getProperty(progress_fileid_property);
  if (existingFileId) {
    file = DriveApp.getFileById(existingFileId);
  } else {
    file = DriveApp.createFile(progress_filename, '');
    PropertiesService.getScriptProperties().setProperty(progress_fileid_property, file.getId());
  }
  file.setContent(JSON.stringify(senderList));
}
function readProgressFile(fileId) {
  try {
    let file = DriveApp.getFileById(fileId);
    if (file.isTrashed()) return null;
    return JSON.parse(file.getBlob().getDataAsString());
  } catch (e) {
    Logger.log(`warning: failed to open progress file with id ${fileId}: ${e}`);
    return null;
  }
}
function writeSenderList(senderList) {
  var sender_array = Array.from(senderList.getSenders(), ([sender, count]) => ([sender, count]));
  var ss = SpreadsheetApp.create(`Gmail count emails by sender for query '${senderList.query}' (${new Date()})`);
  var sh = ss.getActiveSheet()
  sh.clear();
  Logger.log(`writing to spreadsheet '${ss.getSheetName()}' of document '${ss.getName()}'`)
  sh.appendRow(['Email Address', 'Count']);
  sh.getRange(2, 1, sender_array.length, 2).setValues(sender_array).sort({ column: 2, ascending: false });
}
@syeats commented Jan 3, 2024

Thanks for the great code. I ran it on my Gmail inbox (25,000 emails); the first run went well and processed around 18,000 emails in 6 minutes.
I then ran the function again, and after 5 minutes it gave this error:
12:05:01 AM Info error occured while processing, setting processor status to failed: Exception: Service invoked too many times for one day: gmail

Can you confirm whether with a free Gmail account I might only get around 12 minutes of processing a day?
Should I run the function again tomorrow and see if it completes?
Thanks.

@kitenski commented Jan 15, 2024

I'm trying to get a handle on my inbox. This runs but spits out an error as follows:

Execution log
1:49:46 PM Notice Execution started
1:49:47 PM Info starting processing sender list for query 'in: inbox' and email 'me@gmail.com'. Current total messages processed: 0
1:51:17 PM Info current total messages: 500
1:52:48 PM Info current total messages: 1000
1:54:19 PM Info current total messages: 1500
1:55:47 PM Error
Exceeded maximum execution time

@fabianem (Author)

Thanks for the great code. I ran it on my Gmail inbox (25,000 emails); the first run went well and processed around 18,000 emails in 6 minutes. I then ran the function again, and after 5 minutes it gave this error: 12:05:01 AM Info error occured while processing, setting processor status to failed: Exception: Service invoked too many times for one day: gmail

Can you confirm whether with a free Gmail account I might only get around 12 minutes of processing a day? Should I run the function again tomorrow and see if it completes? Thanks.

I'm sorry for not getting back to you sooner.
This error comes from exceeding the daily limit of the Gmail API read method.
As far as I know, the daily quota for reading emails via the Gmail API is 20,000 per day (see Quotas for Google Services).

Currently, the script does not handle this kind of error, which means it doesn't save the progress correctly after the error occurs, and that might lead to incorrect email counts.

@syeats Do you still need this?
I might try to find some time in the next few days to implement better handling for this kind of error in the script.
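
One possible way to handle it (just a sketch, not implemented in the gist; the helper name and the 24-hour retry delay are assumptions) would be to catch that specific exception around the API call, reuse the existing progress-file helpers, and schedule a one-off retry once the quota has reset:

function listMessagesSafely(senderList) {
  // Hypothetical helper, not part of the gist: wraps the Gmail API call so the
  // daily-quota error saves progress instead of ending in the 'failed' state.
  try {
    return Gmail.Users.Messages.list(senderList.email, { maxResults: email_batch, pageToken: senderList.pageToken, q: senderList.query });
  } catch (err) {
    if (String(err).indexOf('Service invoked too many times') !== -1) {
      Logger.log('daily Gmail quota reached, saving progress and scheduling a retry...');
      writeProgressFile(senderList);      // reuse the gist's progress file
      ProcessorStatus.set('not running'); // allow the next run to resume
      // one-off trigger roughly a day later, once the quota has reset (assumed delay)
      ScriptApp.newTrigger('runProcessor').timeBased().after(24 * 60 * 60 * 1000).create();
      return null;                        // signal the caller to stop this run
    }
    throw err; // anything else still bubbles up to runProcessor's catch block
  }
}

processSenderList would then call this helper instead of Gmail.Users.Messages.list directly and return early when it gets null back.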

@fabianem (Author) commented Jan 21, 2024

I'm trying to get a handle on my inbox, this runs but spits out an error as follows:

Execution log
1:49:46 PM Notice Execution started
1:49:47 PM Info starting processing sender list for query 'in: inbox' and email 'me@gmail.com'. Current total messages processed: 0
1:51:17 PM Info current total messages: 500
1:52:48 PM Info current total messages: 1000
1:54:19 PM Info current total messages: 1500
1:55:47 PM Error Exceeded maximum execution time

I ran into this issue myself; what helped was reducing execution_threshold to 4 minutes and email_batch to 200 (I updated the gist accordingly).
This way there is a bigger buffer in case the listing and counting of one iteration takes longer, so the processing can stop gracefully and continue with the next run.

@kitenski Please let me know if you still experience this issue.

@leodevbro

Hi @fabianem, my question is: if I receive one or more emails in my inbox while your code is fetching, will that cause batching mistakes? I mean, does the paging (200 mails per page) start from the newest or the oldest mail (from N1 to N200, then N201 to N400, and so on)? If it starts from the newest, and I receive one or more mails during the paging process, then the page numbering will be offset and the result will have some inaccuracies: it will skip some messages and/or duplicate some messages. Am I correct?

@kitenski

@kitenski Please let me know if you still experience this issue.

It ran a bit longer but still bombed out eventually, @fabianem:

8:48:22 AM Info starting processing sender list for query 'in: inbox' and email 'me@gmail.com'. Current total messages processed: 0
8:48:51 AM Info current total messages: 200
8:49:20 AM Info current total messages: 400
8:49:48 AM Info current total messages: 600
8:50:16 AM Info current total messages: 800
8:50:44 AM Info current total messages: 1000
8:51:12 AM Info current total messages: 1200
8:51:39 AM Info current total messages: 1400
8:52:10 AM Info current total messages: 1600
8:52:42 AM Info current total messages: 1800
8:52:42 AM Info stopping because execution threshold reached, saving progress and creating trigger...

I upped the execution threshold to 10 minutes then got this error:

8:59:48 AM Info current total messages: 1600
9:00:24 AM Info current total messages: 1800
9:01:01 AM Info current total messages: 2000
9:01:06 AM Error
Exceeded maximum execution time

@leodevbro

Here is my log:

Execution log
2:03:23 PM	Notice	Execution started
2:03:40 PM	Info	starting processing sender list for query 'in: inbox' and email 'leodevbro@gmail.com'. Current total messages processed: 0
2:04:05 PM	Info	current total messages: 200
2:04:31 PM	Info	current total messages: 400
2:04:57 PM	Info	current total messages: 600
2:05:23 PM	Info	current total messages: 800
2:05:49 PM	Info	current total messages: 1000
2:06:14 PM	Info	current total messages: 1200
2:06:39 PM	Info	current total messages: 1400
2:07:05 PM	Info	current total messages: 1600
2:07:29 PM	Info	current total messages: 1800
2:07:54 PM	Info	current total messages: 2000
2:07:54 PM	Info	stopping because execution threshold reached, saving progress and creating trigger...
2:07:41 PM	Notice	Execution completed

@leodevbro

Also, my question is: do you clean up the sender addresses? Sometimes the same sender can appear with two different notations; the actual email address inside the string is always the same, but sometimes the notation has additional information (for example a display name) and sometimes not. Does your code treat these as different senders? If so, the result from this code is not 100% accurate.


@fabianem (Author)

Hi @fabianem, my question is: if I receive one or more emails in my inbox while your code is fetching, will that cause batching mistakes? I mean, does the paging (200 mails per page) start from the newest or the oldest mail (from N1 to N200, then N201 to N400, and so on)? If it starts from the newest, and I receive one or more mails during the paging process, then the page numbering will be offset and the result will have some inaccuracies: it will skip some messages and/or duplicate some messages. Am I correct?

Hi @leodevbro, the fetching works like the default behavior when you query your emails in the Gmail web app.
The call to Gmail.Users.Messages.list() returns the messages in descending order according to the provided query (the most recent message is the first in the list response).
That means you are right: if you receive new emails during the fetching (which the applied query would return), those emails will be missed in the counting.

In my use case that small offset wasn't a real problem - is it a problem in your use case?
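
If it ever becomes a problem, one idea (just a sketch, not part of the gist) would be to pin the query to the moment the first run starts, using Gmail's before: search operator; as far as I know it also accepts a unix timestamp in seconds:

// Assumed approach: freeze the result set at the time of the first run.
// The cutoff would have to be stored (e.g. in the progress file) so resumed runs reuse it.
let cutoff = Math.floor(Date.now() / 1000);  // unix timestamp in seconds
let query = `in:inbox before:${cutoff}`;     // mails arriving after this point are ignored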

@fabianem (Author)

@kitenski Please let me know if you still experience this issue.

It ran a bit longer but still bombed out eventually @fabianem,

8:48:22 AM Info starting processing sender list for query 'in: inbox' and email 'me@gmail.com'. Current total messages processed: 0
8:48:51 AM Info current total messages: 200
8:49:20 AM Info current total messages: 400
8:49:48 AM Info current total messages: 600
8:50:16 AM Info current total messages: 800
8:50:44 AM Info current total messages: 1000
8:51:12 AM Info current total messages: 1200
8:51:39 AM Info current total messages: 1400
8:52:10 AM Info current total messages: 1600
8:52:42 AM Info current total messages: 1800
8:52:42 AM Info stopping because execution threshold reached, saving progress and creating trigger...

I upped the execution threshold to 10 minutes then got this error:

8:59:48 AM Info current total messages: 1600
9:00:24 AM Info current total messages: 1800
9:01:01 AM Info current total messages: 2000
9:01:06 AM Error Exceeded maximum execution time

@kitenski, that's strange.
Would you be able to provide full logs for the run when the error occurred?

You shouldn't increase the execution threshold, and definitely not above 6 minutes, because 6 minutes is the script runtime limit set by Google; after that time the script is stopped.
We need some buffer to stop gracefully before reaching this hard limit, so I wouldn't choose 6 minutes as the threshold either.

You could try the opposite, for example a threshold of 2 or 3 minutes; that would give you a good amount of buffer to stop gracefully so the processing can continue with the next run. Additionally, you could also lower the batch size from 200 to 100:

const execution_threshold = 3 * 60 * 1000 // 3 minutes in ms
const email_batch = 100 // number of emails to list in one call

If possible, please provide the full logs of the run where the error happens.

@fabianem (Author)

Also, my question is: do you clean up the sender addresses? Sometimes the same sender can appear with two different notations; the actual email address inside the string is always the same, but sometimes the notation has additional information (for example a display name) and sometimes not. Does your code treat these as different senders? If so, the result from this code is not 100% accurate.


@leodevbro well, that depends on how you define accuracy here...
Currently, the messages are grouped by the From value, which is defined inside the headers field of the message payload. The From value is used as-is; no clean-up happens because I didn't need that in my use case.

For sure, some clean-up of the From value could be added to the script.
Do you need this for your use case, @leodevbro?
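
For reference, a minimal clean-up could look roughly like this (just a sketch; normalizeSender is a made-up helper name): reduce the From header to the bare address before using it as the map key, so 'Some Name <mail@example.com>' and 'mail@example.com' end up in the same bucket.

// Hypothetical helper: normalize a From header to the bare, lower-cased address.
function normalizeSender(from) {
  let match = from.match(/<([^>]+)>/);   // "Some Name <mail@example.com>" -> mail@example.com
  let address = match ? match[1] : from; // plain addresses pass through unchanged
  return address.trim().toLowerCase();
}
// processSenderList would then call senderList.addSender(normalizeSender(sender));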

@leodevbro

In my use case that small offset wasn't a real problem - is that a problem in your use case?

For sure some clean-up of the From value could be added to the script. Do you need this for your use case @leodevbro ?

Hello @fabianem, huge thanks for the quick responses. Your work is very good, but I am dealing with a very large mailbox, around 100K mails in one Gmail account, maybe 200K, and the online method (Apps Script and/or the Gmail API) seems unable to deal with it because Google has daily limits on API calls. Even without the daily limits it is very slow: it takes several minutes to fetch 1000 mails, so 100K would take 3 to 5 hours, maybe more. Of course I mean that the Google online execution system is slow, not your code itself. So I decided to look into an offline solution with a Google Takeout mbox file. I tried this method:

https://stackoverflow.com/questions/77176245/node-js-optimal-way-to-handle-await-inside-the-stream-data-listener-mbox-emai

And it seems to work fine; it takes only 5-10 seconds to analyze 1000 mails, so 100K should take less than 10-20 minutes. It also has a built-in address-cleanup feature, so I can read both the cleaned address and the full sender info text (non-cleaned). And, for sure, since it is an offline mbox file, there is no paging inaccuracy problem; actually, we probably don't need paging at all, because even for 200K mails (which will probably be a 20 GB mbox file), offline analysis seems very fast.

I am going to modify the offline-method code so that simple npm commands can run many kinds of analyses of the mbox file. I am also going to publish it as an npm package, as a simple CLI tool, probably in a few days.
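
Roughly, the offline route looks like this (a much-simplified Node.js sketch, not the actual tool; 'takeout.mbox' is just a placeholder path): stream the Takeout mbox file and count From: headers without any external parser.

// Simplified sketch of the offline approach (Node.js, no external packages).
// It only reads the From: header of each message; real mbox parsing is messier
// (encoded and folded headers, etc.), which is why a proper parser is preferable.
const fs = require('fs');
const readline = require('readline');

async function countSenders(mboxPath) {
  const counts = new Map();
  let inHeaders = false;
  const rl = readline.createInterface({ input: fs.createReadStream(mboxPath) });
  for await (const line of rl) {
    if (line.startsWith('From ')) {          // mbox message separator
      inHeaders = true;
    } else if (inHeaders && line === '') {    // blank line ends the header block
      inHeaders = false;
    } else if (inHeaders && /^From:/i.test(line)) {
      const sender = line.replace(/^From:\s*/i, '').trim();
      counts.set(sender, (counts.get(sender) || 0) + 1);
    }
  }
  return [...counts.entries()].sort((a, b) => b[1] - a[1]);
}

countSenders('takeout.mbox').then(rows => console.table(rows.slice(0, 50)));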

@fabianem (Author)

@leodevbro that's indeed a huge mailbox 😄 - you are right, unfortunately for a free account the daily limit for listing messages via the API is 20k. Even with a work account it's only 50k/day.

Actually, I didn't know about the Google Takeout tool, thanks for sharing that.
I guess with this solution you could also miss some new emails you would receive after exporting the mbox file.
Your idea definitely sounds interesting so I hope it works out for you!

@syeats commented Feb 16, 2024 via email

@leodevbro commented May 15, 2024

Here is the video tutorial on how to find the senders that sent you the most mails in Gmail, using the Node.js method:

https://www.youtube.com/watch?v=KKU84ogffeM
