Skip to content

Instantly share code, notes, and snippets.

@renoirb
Last active January 21, 2024 20:38
Show Gist options
  • Save renoirb/8d255b9b4a90e83332134dd63af34d1f to your computer and use it in GitHub Desktop.
Save renoirb/8d255b9b4a90e83332134dd63af34d1f to your computer and use it in GitHub Desktop.
Extract data from Proton Mail
/**
* Extract all messages from left panel
*
* We can click on "next" page, and re-run
*/
/**
 * Field extractors evaluated against every message row of the message list.
 *
 * Each entry is a tuple `[fieldName, rowSelector, detailSelector, extract]`:
 * - fieldName: key under which the extracted value is stored for the row
 * - rowSelector: selector resolved inside the message row
 * - detailSelector: selector resolved inside the node matched by rowSelector
 * - extract: receives the node matched by detailSelector and returns the value
 */
var LINES = [
  [
    // Yesterday
    'sentDateText',
    '.delight-item-firstline',
    '.delight-item-firstline-infos time[datetime]',
    (n) => n.textContent,
  ],
  [
    // Thursday, August 19th, 2021 at 20:11
    'sentDateTime',
    '.delight-item-firstline',
    '.delight-item-firstline-infos time[datetime]',
    (n) => n.getAttribute('datetime'),
  ],
  [
    'senderEmails',
    '.delight-item-firstline',
    '.item-senders [data-testid="message-column:sender-address"][title]',
    (n) => n.getAttribute('title').split(', '),
  ],
  [
    'senderNames',
    '.delight-item-firstline',
    '.item-senders [data-testid="message-column:sender-address"][title]',
    // Trim each part so names after the first do not keep the space that follows the comma
    (n) => n.textContent.split(',').map((name) => name.trim()),
  ],
  [
    'subject',
    '.delight-item-secondline',
    '.item-subject [role="heading"][data-testid="message-column:subject"]',
    (n) => n.textContent,
  ],
  [
    'folder',
    '.delight-item-secondline',
    // '.item-subject > span:not([role="heading"])' /* To get one or many, but would need more work */,
    '.item-subject span.flex span[data-testid^="item-location"]' /* Just get the first one, even though there can be many */,
    (n) => {
      // Empty string when there is no node, or the node carries no data-testid
      const folder = n?.hasAttribute('data-testid') ? n.textContent : ''
      console.log('folder', { element: n, folder })
      return folder
    },
  ],
  /**
   * This won't work, the DOM does not have a list of labels, only the first
   *
   * ['labels', '.item-secondline > .item-icons > .item-meta-infos > ul', (n) => Array.from(n.childNodes);],
   */
]
// Selector of the element whose child nodes are the message rows.
// Reassigned by tryMessageListParentSelector() when another selector is accepted.
let MESSAGE_LIST_PARENT_SELECTOR = '.delight-items-column-list-inner.delight-items-column-list-inner--mail .delight-items-column-list-container div'
// Accumulates one object per extracted message, across every appendToRows() call.
let rows = []
// First element matching MESSAGE_LIST_PARENT_SELECTOR (null when nothing matches).
let MESSAGE_LIST = document.querySelector(MESSAGE_LIST_PARENT_SELECTOR)
/**
 * Check that a selector matches an element with child nodes and, when it does,
 * make that element the active message list.
 *
 * On success both MESSAGE_LIST and MESSAGE_LIST_PARENT_SELECTOR are updated;
 * on failure neither is touched.
 *
 * @param {string} [selector] - Selector to try; the current MESSAGE_LIST_PARENT_SELECTOR is used when nullish.
 * @throws {Error} When the selector matches nothing, or an element without child nodes.
 */
let tryMessageListParentSelector = (selector) => {
  // Resolve the fallback once, so the element looked up and the selector stored always agree
  const effectiveSelector = selector ?? MESSAGE_LIST_PARENT_SELECTOR
  const parent = document.querySelector(effectiveSelector)
  const shouldBeNonZero = parent?.childNodes.length ?? 0
  if (shouldBeNonZero === 0) {
    const message = `Selector did not find a div with many children for messages`
    throw new Error(message)
  }
  MESSAGE_LIST = parent
  MESSAGE_LIST_PARENT_SELECTOR = effectiveSelector
  console.log('tryMessageListParent', { selector: effectiveSelector, parent, shouldBeNonZero, passed: true })
}
// Validate the default selector right away: throws when it does not match an element with child nodes.
tryMessageListParentSelector(MESSAGE_LIST_PARENT_SELECTOR)
/**
 * Extract every message row under MESSAGE_LIST and append the results to `rows`.
 *
 * Run the following for each page. Click manually, then invoke this, filling the "rows" array.
 *
 * Every LINES entry is evaluated against each row. A field is left out of the
 * row's object when its detail node is missing, and is stored as null when its
 * extractor throws. Rows are collected first and appended in one go, so a call
 * that throws leaves `rows` untouched and can be retried without duplicates.
 *
 * @throws {Error} When there is no message list, or a row has no node matching a row selector.
 */
var appendToRows = () => {
  if (!MESSAGE_LIST) {
    throw new Error('No message list available: tryMessageListParentSelector() must succeed first')
  }
  const pageRows = []
  // `children` instead of `childNodes`: text and comment nodes have no querySelector()
  const messageNodes = Array.from(MESSAGE_LIST.children)
  for (const [i, MESSAGE_DOM_NODE] of messageNodes.entries()) {
    console.debug(`row ${i}.0`, { MESSAGE_DOM_NODE, i })
    const data = Object.create(null)
    for (const [fieldName, rowSel, detailSel, closure] of LINES) {
      const rowNode = MESSAGE_DOM_NODE.querySelector(rowSel)
      console.debug(`row ${fieldName} ${i}.1`, { rowSel, detailSel, rowNode })
      if (!rowNode) {
        const message = `Error at row ${i} for rowNode`
        throw new Error(message)
      }
      const detailRowNode = rowNode.querySelector(detailSel)
      console.debug(`row ${fieldName} ${i}.2`, { rowNode, detailRowNode, closure })
      if (!detailRowNode) {
        // Optional detail (e.g. no folder badge): leave the field out
        continue
      }
      let value = null
      try {
        value = closure.call(rowNode, detailRowNode)
      } catch (e) {
        // One failing extractor must not lose the other fields of the row
        console.error(`Error at row ${fieldName} ${i}.3: ${e}`, { value })
        value = null
      }
      data[fieldName] = value
      console.debug(`row ${fieldName} ${i}.3`, { value })
    }
    pageRows.push(data)
  }
  rows.push(...pageRows)
}
/**
 * Parse raw email headers displayed in the page into a Map.
 *
 * From a message, click on "more" and "message headers", then run this command.
 *
 * - Parsing stops at the first empty line (end of the header section).
 * - Folded headers (continuation lines starting with whitespace) are unfolded
 *   by removing the line break, as described in RFC 5322 section 2.2.3.
 * - Header names are lower-cased; values are trimmed.
 * - Headers whose name starts with "x-pm" are skipped.
 * - Lines that are neither a continuation nor "name: value" are ignored.
 * - When a header name occurs more than once, the last occurrence wins.
 *
 * @param {string} [selector='pre'] - Selector of the element holding the raw headers.
 * @returns {Map<string, string>} Header name to header value; empty when the
 *   element is missing or its text does not start with a header line.
 */
var extractEmailHeaders = (selector = 'pre') => {
  const headers = new Map()
  const elRef = document.querySelector(selector)
  const textContent = elRef?.textContent ?? ''
  if (!/^[A-Z-]+:/i.test(textContent)) {
    return headers
  }

  // Header being assembled; null while there is none, or while a skipped header is being read
  let currentName = null
  let currentValue = ''
  const commit = () => {
    if (currentName !== null) {
      headers.set(currentName, currentValue.trim())
    }
    currentName = null
    currentValue = ''
  }

  for (const line of textContent.split(/\r?\n/)) {
    if (line === '') {
      // An empty line ends the header section
      break
    }
    if (/^\s/.test(line)) {
      // Continuation of the previous header: unfold by dropping only the line break
      if (currentName !== null) {
        currentValue += line
      }
      continue
    }
    commit()
    // Only the first ":" separates name and value, e.g. "Subject: Re: Fooo"
    const colonAt = line.indexOf(':')
    if (colonAt < 1) {
      continue
    }
    const rawName = line.slice(0, colonAt)
    if (/^x-pm/i.test(rawName)) {
      continue
    }
    // toLowerCase() rather than toLocaleLowerCase(): the result must not depend on the browser locale
    currentName = rawName.toLowerCase().trim()
    currentValue = line.slice(colonAt + 1)
  }
  commit()
  return headers
}
/**
 * Format a selection of headers as "name: value" lines.
 *
 * From the result of extractEmailHeaders,
 * use this to format the headers you want to extract.
 *
 * Headers are written in the order of `headerNames`; a header that is absent
 * or has an empty value is left out. Names are looked up as given, so they
 * must match the keys of `map` (extractEmailHeaders produces lower-case keys).
 *
 * @param {Iterable<[string, string]>} map - Headers, typically the Map returned by extractEmailHeaders.
 * @param {Iterable<string>} [headerNames] - Names of the headers to output, in output order.
 * @returns {string} The selected headers, one per line.
 */
var stringifyUsefulHeaders = (
  map,
  headerNames = ['from', 'subject', 'to', 'reply-to', 'x-original-to', 'delivered-to', 'x-attached'],
) => {
  // Building a Map accepts any iterable of entries (or nothing at all) and never touches the input
  const lookup = new Map(map)
  const lines = []
  for (const headerName of headerNames) {
    const headerData = lookup.get(headerName)
    if (headerData) {
      lines.push(`${headerName}: ${headerData}`)
    }
  }
  return lines.join('\n')
}
@renoirb
Copy link
Author

renoirb commented Mar 7, 2023

Process

  1. Create a label (e.g. "Sender") that will be used to group messages from many senders, from banking institutions to any social provider.
  2. Tag messages from each Sender, for each type of messages such as password change, important account change, to promotional messages, etc.
  3. Once you have many messages, click on the label to see all messages
  4. To extract data for each email (e.g. from, to, subject, etc.)
    1. "Extract from message list", copy the code and paste in browser Dev-Tools, then execute appendToRows() for each page.
    2. Review entries using rows in Dev-Tools
    3. Use copy(rows) in Dev-Tools and paste the contents into an empty file; this is useful for reviewing the subject, sender, etc.
  5. Extract Email Headers
    1. Go "Extract from message headers", copy-paste the code
    2. On any email
    3. click on "more" and "message headers"
    4. In Dev-Tools, run extractEmailHeaders()
    5. To get common headers, assign to another variable var headers = extractEmailHeaders()
    6. Then run stringifyUsefulHeaders(headers)

With all of this, it should help to figure out all the variations of headers from the same sender but for different purposes.

Example output

Extract from message list

[
  {
    "email": [
        "support@other-example.com"
    ],
    "senderName": [
        "Other Example"
    ],
    "subject": "Login attempt from new device on Friday Feb 11 2023, 03:28PM EST",
    "folder": "Trash"
  },
  {
    "email": [
        "support@ui.com",
        "bob@example.com"
    ],
    "senderName": [
        "Ubiquiti Inc",
        "Bob Example"
    ],
    "subject": "Re: Access to UDM Pro at home through UI Account no lo...",
    "folder": "Trash"
  },
  {
    "email": [
        "no-reply-FooBarBazz@notifications.ui.com"
    ],
    "senderName": [
        "UniFi OS",
        "FooBarBazz"
    ],
    "subject": "New UniFi Login on FooBarBazz",
    "folder": "Some folder name"
  }
]

Extract from message headers

from: "UniFi OS, FooBarBazz" <no-reply-FooBarBazz@notifications.ui.com>
subject: New UniFi Login on FooBarBazz
to: bob@example.com
x-original-to: bob@example.com
delivered-to: bob@example.com
x-attached: unifi-logo-blue.png

@renoirb
Copy link
Author

renoirb commented Jan 21, 2024

jq queries

Finding based on name in one of the senders

jq '.[] | select(.senderNames[0] | contains("amazon"))'

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment