# Python3 GraphQL script to import an email archive to GitHub discussions (rev 4)
#
# Copyright (c) 2021, Lawrence Livermore National Security, LLC
#
# Python3 script using the GraphQL interface to GitHub discussions to read
# .txt mbox files of the VisIt users email archive and import each
# email thread as a GitHub discussion.
#
# 1. Reads a directory of .txt files in mbox format for messages (readAllMboxFiles)
# 2. Removes duplicate messages (removeDateAndSubjectDups) which have
# identical dates and highly similar subjects.
# 3. Threads messages using filtered email subjects and date proximity
# (threadMessages) into a dict of threads keyed by subject. Also wholly
# removes a selection of message topics (release announcements, svn
# commit/update messages, test suite runs).
# 4. Removes various bad cases (removeBadMessages).
# 5. Reads your GitHub token from a .txt file `ghToken.txt`
# 6. Iterates over threads using the GraphQL interface to create a new
# discussion for each thread and add the remaining messages in each
# thread as comments.
# 7. Locks each resulting discussion
# 8. Applies a label to each resulting discussion
# 9. Throttles (inserts sleeps) to stay within GitHub GraphQL rate
# limits (throttleRate)
# 10. Maintains knowledge of state to safely restart and pick up
# where it left off
# 11. Logs failure cases to a message log (text) file
# 12. In addition to GraphQL work, this script will create two files
# - email2discussions-restart.txt: list of subject keys successfully processed
# - email2discussions-failures-log.txt: graphql failures
# 13. You will need to modify data on these lines...
# Params to the threadMessages algorithm to control date proximity,
# subject similarity, subject uniqueness
# line 39: rootDir containing all mbox .txt files
# line 46: you may need to add time-zone names
# line 158-166: Logic to ignore certain email subjects.
# function filterSubject: which filters email subject lines
# function filterBody: which filters the email body
# lines 927-end: GitHub org/user name, repo names, label names,
# discussion names...
#
# Programmer: Mark C. Miller, Tue Jul 20 10:21:40 PDT 2021
#
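# Illustrative usage (assuming this script is saved as email2discussions.py;
# the file name itself is not significant):
#
#   python3 email2discussions.py
#
# with ghToken.txt (see below) and the mbox .txt files in rootDir in place.
#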
import datetime, email.header, glob, mailbox, os, pytz
import re, requests, shutil, sys, textwrap, time
from difflib import SequenceMatcher
# directory containing all the .txt files from the email archive
#rootDir = "/Users/miller86/visit/visit-users-email"
rootDir = "/Users/miller86/visit/visit-developers-email"
#
# Smarter datetime func that culls the time-zone name, if present, and
# normalizes all datetimes to the Pacific (current) time zone.
# CEST and EEST are matched by their first 3 letters only.
#
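# Illustrative examples of date strings mydt() is designed to handle:
#   'Tue, 20 Jul 2021 10:21:40 -0700 (PDT)'  ->  '(PDT)' is culled, parsed with %z
#   'Tue, 20 Jul 2021 10:21 -0700'           ->  no seconds, second format is tried
#   None                                     ->  falls back to the current local time
#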
tzNames = [ \
'GMT', 'UTC', 'ECT', 'EET', 'ART', 'EAT', 'MET', 'NET', 'PLT', 'IST', 'BST', 'VST',
'CTT', 'JST', 'ACT', 'AET', 'SST', 'NST', 'MIT', 'HST', 'AST', 'PST', 'PNT', 'MST',
'CST', 'EST', 'IET', 'PRT', 'CNT', 'AGT', 'BET', 'CAT', 'CET', 'PDT', 'EDT', 'CES',
'KST', 'MSK', 'EES', 'MDT', 'CDT', 'SGT', 'AKD', 'US/']
def mydt(d):
if not d:
return datetime.datetime.now().astimezone()
tzn = d.split()[-1][1:4]
if tzn in tzNames:
d = " ".join(d.split()[:-1]) # removes the last term (time zone name)
try:
return datetime.datetime.strptime(d,'%a, %d %b %Y %H:%M:%S %z').astimezone()
except:
try:
return datetime.datetime.strptime(d,'%a, %d %b %Y %H:%M %z').astimezone()
except:
return datetime.datetime.strptime(d,'%a, %d %b %Y %H:%M:%S').astimezone()
#
# Iterate over all files, loading each as a mailbox and concatenating the
# items into one long list. Sort the resulting list by date.
#
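# NOTE: the sort below assumes the archive files are named by year and month
# so they parse with the '%Y-%B.txt' pattern, e.g. (illustrative) 2004-August.txt
#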
def readAllMboxFiles():
print("Reading messages...")
files = glob.glob("%s/*.txt"%rootDir)
files = sorted(files, key=lambda f: datetime.datetime.strptime(f,rootDir+'/%Y-%B.txt'))
items = []
for f in files:
mb = mailbox.mbox(f)
items += mb.items()
print(" read %03d items from file \"%s\" "%(len(mb.items()),f),end='\r')
print("\n%d messages read"%len(items))
print("Sorting messages by date...")
sitems = sorted(items, key=lambda m: mydt(m[1]['Date']))
print("Done")
return sitems
#
# If adjacent items have identical dates and highly similar subjects
# they are probably dups. Remove them now.
#
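# Illustrative example of the similarity test used below:
#   SequenceMatcher(None, 'plotting error', 'plotting errors').ratio()  # ~0.97 > 0.6, so a dup
#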
def removeDateAndSubjectDups(sitems):
i = 0
count = len(sitems)-1
rmcount = 0
while i < count:
curdate = mydt(sitems[i][1]['Date'])
cursubj = filterSubject(sitems[i][1]['Subject'])
j = i+1
while (j < count) and (mydt(sitems[j][1]['Date']) == curdate):
cksubj = filterSubject(sitems[j][1]['Subject'])
r = SequenceMatcher(None, cursubj, cksubj).ratio()
if r > 0.6:
del sitems[j]
count -= 1
rmcount += 1
else:
j += 1
i += 1
print("Removed %d dups with equal datetimes and similar subjects"%rmcount)
return sitems
#
# Thread messages keyed by filtered subject and within 90 days of
# each other. When an exact subject match fails, try a similarity
# match before giving up and creating a new thread.
#
# - timeDeltaDays is how wide a window, in days, is used to match
# similar (not identical) subjects
# - similarityThreshold is the minimum similarity ratio (as defined by
# SequenceMatcher) needed to consider two different subjects the same.
# The default of 0.6 is the value recommended by the SequenceMatcher docs.
# - uniqSubjLen is the minimum subject length, in characters, for which
# matching instances more than timeDeltaDays apart are still
# considered part of the *same* thread.
#
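# For example, a stricter threading pass could be run as (values illustrative,
# not tuned against this archive):
#   threads = threadMessages(sitems, timeDeltaDays=30, similarityThreshold=0.75, uniqSubjLen=40)
#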
def threadMessages(sitems, timeDeltaDays=90, similarityThreshold=0.6, uniqSubjLen=25):
print("Threading...")
threads = {}
recentSubjects = []
recentDates = []
coincidentalSubjectCounts = {}
for i in range(len(sitems)):
# Dereference the current message
msg = sitems[i][1]
# Output progress information
p = int(100*float(i)/len(sitems))
print(" %02d %% completed, recent subjects count=%03d"%(p,len(recentSubjects)), end='\r')
# Ignore messages that appear to be GitHub related
# notifications, using the raw (unfiltered) subject
sub = msg['Subject']
if sub and 'visit-dav' in sub:
continue
if msg['In-Reply-To'] and 'visit-dav' in msg['In-Reply-To']:
continue
if msg['References'] and 'visit-dav' in msg['References']:
continue
curdate = mydt(msg['Date'])
cursubj = filterSubject(msg['Subject'])
# Ignore messages with subjects that appear to be announcements of releases
# or subversion commit/update messages or test suite runs
if re.match('visit [0-9]*.[0-9]*.[0-9]* released', cursubj):
continue
if 'svn' in cursubj and 'update' in cursubj:
continue
if 'svn' in cursubj and 'commit' in cursubj:
continue
if 'test suite run' in cursubj and \
('passed' in cursubj or 'failed' in cursubj):
continue
# Keep recent subjects/dates up to date by deleting entries (at the beginning)
# more than timeDeltaDays days older than the current message. This works because
# the input messages list is already sorted by date.
while recentDates and \
(curdate - recentDates[0]) > datetime.timedelta(days=timeDeltaDays):
del recentDates[0]
del recentSubjects[0]
# Try exact match first
if cursubj in recentSubjects:
threads[cursubj] += [msg]
idx = recentSubjects.index(cursubj)
del recentSubjects[idx]
del recentDates[idx]
recentSubjects += [cursubj]
recentDates += [curdate]
continue
# Try exact match on the modified subject if this is a subject
# for which we have multiple threads
if cursubj in coincidentalSubjectCounts.keys():
cursubj1 = cursubj + "~%d"%coincidentalSubjectCounts[cursubj]
if cursubj1 in recentSubjects:
threads[cursubj1] += [msg]
idx = recentSubjects.index(cursubj1)
del recentSubjects[idx]
del recentDates[idx]
recentSubjects += [cursubj1]
recentDates += [curdate]
continue
# Ok, try fuzzy match by taking the *first* maximum-ratio match
# for which that maximum ratio exceeds our similarity threshold
ratios = [SequenceMatcher(None, cursubj, recentSubjects[j]).ratio() for j in range(len(recentSubjects))]
maxr = max(ratios) if ratios else 0
if maxr > similarityThreshold:
maxi = ratios.index(maxr)
cursubj1 = recentSubjects[maxi]
threads[cursubj1] += [msg]
del recentSubjects[maxi]
del recentDates[maxi]
recentSubjects += [cursubj1]
recentDates += [curdate]
continue
#
# Looks like a new thread. However, it could coincidentally have a
# subject identical to a thread more than timeDeltaDays days old. This
# is common for subjects like "question" or "help" or "no subject", etc.
# The more characters there are in a thread subject, the more unique
# it is, so identical matches here for longer subject names are more
# than likely part of the same thread even if separated in time by more
# than timeDeltaDays days.
#
if cursubj in threads.keys():
if len(cursubj) > uniqSubjLen:
# add to existing thread
threads[cursubj] += [msg]
try:
idx = recentSubjects.index(cursubj)
del recentSubjects[idx]
del recentDates[idx]
except ValueError:
pass
recentSubjects += [cursubj]
recentDates += [curdate]
continue
else:
# This logic deals with possible same subject separated in time
# by more than our timeDeltaDays threshold. Encountering the same
# subject more than timeDeltaDays days later is treated as a *new*
# thread of that subject so we append a number/count to the subject.
if cursubj in coincidentalSubjectCounts.keys():
coincidentalSubjectCounts[cursubj] += 1
else:
coincidentalSubjectCounts[cursubj] = 1
cursubj1 = cursubj + "~%d"%coincidentalSubjectCounts[cursubj]
# start a new thread on this modified subject
threads[cursubj1] = [msg]
recentSubjects += [cursubj1]
recentDates += [curdate]
continue
# start a *new* thread
threads[cursubj] = [msg]
recentSubjects += [cursubj]
recentDates += [curdate]
print("\nDone")
print("Subjects with multiple instances in time...")
for s in coincidentalSubjectCounts.keys():
print(" %d distinct threads with subject \"%s\""%(coincidentalSubjectCounts[s],s))
return threads
#
# Remove certain bad cases
#
def removeBadMessages(msgLists):
if None in msgLists.keys():
print("There are", len(msgLists[None]), "messages with subject = None")
del msgLists[None]
if '' in msgLists.keys():
print("There are", len(msgLists['']), "messages with subject = ''")
del msgLists['']
delItems = []
for k in msgLists.keys():
if len(msgLists[k]) <= 1:
delItems += [k]
print("Deleting %d threads of size <= 1"%len(delItems))
for i in delItems:
del msgLists[i]
#
# Debug: list all the subject lines
#
def debugListAllSubjects(msgLists):
for k in msgLists.keys():
mlist = msgLists[k]
print("%d messages for subject = \"%s\"\n"%(len(mlist),k))
#
# Debug: print some diagnostics info
#
def printDiagnostics(msgLists):
nummsg = 0
maxlen = 0
maxmsglen = 0
maxth = None
maxmsgth = None
for k in msgLists:
th = msgLists[k]
for m in th:
msglen = len(m.get_payload())
if msglen > maxmsglen:
maxmsglen = msglen
maxmsgth = th
l = len(th)
if l > maxlen:
maxlen = l
maxth = th
nummsg += l
print("Total threaded messages = %d"%nummsg)
print("Total threads = %d"%len(msgLists))
print("Max thread length = %d with subject..."%maxlen)
print(" \"%s\""%filterSubject(maxth[0]['Subject']))
print("Max message body size = %d"%maxmsglen)
print(" \"%s\""%filterSubject(maxmsgth[0]['Subject']))
#
# Capture failure details to a continuously appending file
#
def captureGraphQlFailureDetails(gqlQueryName, gqlQueryString, gqlResultString):
with open("email2discussions-failures-log.txt", 'a') as f:
f.write("%s - %s\n"%(datetime.datetime.now().strftime('%y%b%d %I:%M:%S'),gqlQueryName))
f.write("--------------------------------------------------------------------------\n")
f.write(gqlResultString)
f.write("\n")
f.write("--------------------------------------------------------------------------\n")
f.write(gqlQueryString)
f.write("\n")
f.write("--------------------------------------------------------------------------\n\n\n\n")
#
# Read token from 'ghToken.txt'
#
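# For example (illustrative; any single-line file containing a GitHub personal
# access token works):
#   echo 'ghp_XXXXXXXXXXXXXXXXXXXX' > ghToken.txt && chmod 600 ghToken.txt
#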
def GetGHToken():
if not hasattr(GetGHToken, 'ghToken'):
try:
with open('ghToken.txt', 'r') as f:
GetGHToken.ghToken = f.readline().strip()
except:
raise RuntimeError('Put a GitHub token in \'ghToken.txt\' readable only by you.')
return GetGHToken.ghToken
#
# Build standard header for URL queries
#
headers = \
{
'Content-Type': 'application/json',
'Authorization': 'bearer %s'%GetGHToken(),
'GraphQL-Features': 'discussions_api'
}
#
# Workhorse routine for performing a GraphQL query. A simple function that
# uses requests.post to make the API call; note the json= argument.
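#
# Illustrative usage:
#   result = run_query('query { viewer { login } }')
#   print(result['data']['viewer']['login'])
#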
def run_query(query):
if not hasattr(run_query, 'numSuccessiveFailures'):
run_query.numSuccessiveFailures = 0;
# Post the request. Check for possible error return and sleep and retry if so.
try:
request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
result = request.json()
i = 0
while 'errors' in result and i < 100:
print("....retrying \"%s\" after sleeping 3 seconds"%query[:30])
time.sleep(3)
request = requests.post('https://api.github.com/graphql', json={'query': query}, headers=headers)
result = request.json()
i = i + 1
if 'errors' in result and i == 100:
raise Exception(">100 successive query failures, exiting...")
sys.exit(1)
run_query.numSuccessiveFailures = 0
except:
captureGraphQlFailureDetails('run_query', query, "")
run_query.numSuccessiveFailures += 1
if run_query.numSuccessiveFailures > 3:
raise Exception(">3 successive query failures, exiting...")
sys.exit(1)
if request.status_code == 200:
return request.json()
else:
raise Exception("run_query failed with code of {}. {} {}".format(request.status_code, query, request.json()))
#
# A method to call periodically to ensure we don't exceed GitHub's rate
# limits. It costs part of our limit to call it, so we don't want to
# call it all the time. It is coded to do *real* GraphQL work only once
# per minute, no matter how often it is actually called.
#
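# A successful result of the query below looks roughly like (values illustrative):
#   {'data': {'viewer': {'login': 'someuser'},
#             'rateLimit': {'limit': 5000, 'remaining': 4378, 'resetAt': '2021-07-20T18:00:00Z'}}}
#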
def throttleRate():
# set the *last* check 61 seconds in the past to force a check
# the very *first* time we run this
if not hasattr(throttleRate, 'lastCheckNow'):
throttleRate.lastCheckNow = datetime.datetime.now()-datetime.timedelta(seconds=61)
query = """
query
{
viewer
{
login
}
rateLimit
{
limit
remaining
resetAt
}
}
"""
# Perform this check only about once a minute
now = datetime.datetime.now()
if (now - throttleRate.lastCheckNow).total_seconds() < 60:
return
throttleRate.lastCheckNow = now
try:
result = run_query(query)
zuluOffset = 7 * 3600 # subtract PDT timezone offset from Zulu
if 'errors' in result.keys():
toSleep = (throttleRate.resetAt-now).total_seconds() - zuluOffset + 1
print("Reached end of available queries for this cycle. Sleeping %g seconds..."%toSleep)
time.sleep(toSleep)
return
# Gather rate limit info from the query result
limit = result['data']['rateLimit']['limit']
remaining = result['data']['rateLimit']['remaining']
# resetAt is given in Zulu (UTC-Epoch) time
resetAt = datetime.datetime.strptime(result['data']['rateLimit']['resetAt'],'%Y-%m-%dT%H:%M:%SZ')
toSleep = (resetAt-now).total_seconds() - zuluOffset
print("GraphQl Throttle: limit=%d, remaining=%d, resetAt=%g seconds"%(limit, remaining, toSleep))
# Capture the first valid resetAt point in the future
throttleRate.resetAt = resetAt
if remaining < 200:
print("Reaching end of available queries for this cycle. Sleeping %g seconds..."%toSleep)
time.sleep(toSleep)
except:
captureGraphQlFailureDetails('rateLimit', query, "")
#
# Get various visit-dav org. repo ids. Caches results so that subsequent
# queries don't do any GraphQL work.
#
def GetRepoID(orgname, reponame):
query = """
query
{
repository(owner: \"%s\", name: \"%s\")
{
id
}
}
"""%(orgname, reponame)
if not hasattr(GetRepoID, reponame):
result = run_query(query)
# result = {'data': {'repository': {'id': 'MDEwOlJlcG9zaXRvcnkzMjM0MDQ1OTA='}}}
setattr(GetRepoID, reponame, result['data']['repository']['id'])
return getattr(GetRepoID, reponame)
#
# Get object id by name for a given repo name and org/user name.
# Caches the reponame/objname pair so that subsequent queries don't do any
# GraphQL work.
#
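# Illustrative usage (mirrors the calls in the main program at the bottom):
#   labid = GetObjectIDByName('visit-dav', 'visit', 'labels', 30, 'email archive')
#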
def GetObjectIDByName(orgname, reponame, gqlObjname, gqlCount, objname):
query = """
query
{
repository(owner: \"%s\", name: \"%s\")
{
%s(first:%d)
{
edges
{
node
{
description,
id,
name
}
}
}
}
}
"""%(orgname, reponame, gqlObjname, gqlCount)
if not hasattr(GetObjectIDByName, "%s.%s"%(reponame,objname)):
result = run_query(query)
# result = d['data']['repository']['discussionCategories']['edges'][0] =
edges = result['data']['repository'][gqlObjname]['edges']
for e in edges:
if e['node']['name'] == objname:
setattr(GetObjectIDByName, "%s.%s"%(reponame,objname), e['node']['id'])
break
return getattr(GetObjectIDByName, "%s.%s"%(reponame,objname))
#
# Create a discussion and return its id
#
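# NOTE: subject and body are interpolated directly into the query string, so they
# must already have had problematic characters (backslashes, double quotes)
# filtered out (see filterSubject and filterBody below).
#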
def createDiscussion(repoid, catid, subject, body):
query = """
mutation
{
createDiscussion(input:
{
repositoryId:\"%s\",
categoryId:\"%s\"
title:\"%s\",
body:\"%s\"
})
{
discussion
{
id
}
}
}
"""%(repoid, catid, subject, body)
try:
result = run_query(query)
# {'data': {'createDiscussion': {'discussion': {'id': 'MDEwOkRpc2N1c3Npb24zNDY0NDI1'}}}}
return result['data']['createDiscussion']['discussion']['id']
except:
captureGraphQlFailureDetails('createDiscussion', query,
repr(result) if 'result' in locals() else "")
return None
#
# Add a comment to a discussion
#
def addDiscussionComment(discid, body):
query = """
mutation
{
addDiscussionComment(input:
{
discussionId:\"%s\",
body:\"%s\"
})
{
comment
{
id
}
}
}
"""%(discid, body)
try:
result = run_query(query)
# {'data': {'addDiscussionComment': {'comment': {'id': 'MDE3OkRpc2N1c3Npb25Db21tZW50MTAxNTM5Mw=='}}}}
return result['data']['addDiscussionComment']['comment']['id']
except:
captureGraphQlFailureDetails('addDiscussionComment %s'%discid, query,
repr(result) if 'result' in locals() else "")
return None
#
# lock an object (primarily to lock a discussion)
#
def lockLockable(nodeid):
query = """
mutation
{
lockLockable(input:
{
clientMutationId:\"scratlantis:emai2discussions.py\",
lockReason:RESOLVED,
lockableId:\"%s\"
})
{
lockedRecord
{
locked
}
}
}"""%nodeid
try:
result = run_query(query)
except:
captureGraphQlFailureDetails('lockLockable %s'%nodeid, query,
repr(result) if 'result' in locals() else "")
#
# Add a convenience label to each discussion
# The label id was captured during startup
#
def addLabelsToLabelable(nodeid, labid):
query = """
mutation
{
addLabelsToLabelable(input:
{
clientMutationId:\"scratlantis:emai2discussions.py\",
labelIds:[\"%s\"],
labelableId:\"%s\"
})
{
labelable
{
labels(first:1)
{
edges
{
node
{
id
}
}
}
}
}
}"""%(labid, nodeid)
try:
result = run_query(query)
except:
captureGraphQlFailureDetails('addLabelsToLabelable %s'%nodeid, query,
repr(result) if 'result' in locals() else "")
#
# Method to filter subject lines
#
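# Illustrative example:
#   filterSubject('Re: [visit-users] Plotting error (#1234)')  ->  'plotting error'
#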
def filterSubject(su):
if not su:
return "no subject"
# Handle occasional odd-ball encoding
suparts = email.header.decode_header(su)
newsu = ''
for p in suparts:
if isinstance(p[0],bytes):
try:
newsu += p[0].decode('utf-8')
except UnicodeDecodeError:
newsu += ''.join([chr(i) if i < 128 else ' ' for i in p[0]])
else:
newsu += p[0]
su = newsu
# handle line-wraps and other strange whitespace
su = re.sub('\s+',' ', su)
su = su.replace('"',"'")
su = su.lower()
# Get rid of all these terms
stringsToRemove = ['visit-users', '[external]', 'visit-dav/live-customer-response', '[bulk]',
'[ext]', '[github]', '[ieee_vis]', '[sec=unclassified]', '[sec=unofficial]',
'[solved]', '[visit-announce]', '[visit-commits]',
'visit-core-support', 'visit-dav/visit', 'visit-developers', 'visit-help-asc',
'visit-help-scidac', 're:', 're :', 'fwd:', 'fw:', '[unclassifed]',
'[non-dod source]', 'possible spam', 'suspicious message', 'warning: attachment unscanned',
'warning: unscannable extraction failed', '[]', '()', '{}']
for s in stringsToRemove:
su = su.replace(s,'')
# Get rid of GitHub bug identifiers
su = re.sub('\s+\(#[0-9]*\)','',su)
return su.strip()
#
# Replacement function for re.sub to replace phone number matches with
# a string of the same number of characters
#
def overwriteChars(m):
return 'X' * len(m.group())
#
# Method to filter the body. Currently designed around the notion that the
# body will be rendered as "code" (between ```) and not as GitHub markdown.
#
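# For example (illustrative), a line containing '(925) 555-1234' has that match
# overwritten by an equal-length run of 'X' characters by the substitutions below.
#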
wrapper = textwrap.TextWrapper(width=100)
def filterBody(body):
retval = body[:20000]+' truncated...' if (len(body)) > 20000 else body
# filter out anything that looks like a phone number including international #'s
# Unfortunately, this can corrupt any lines of raw integer or floating point
# data in the email body. Masking telephone numbers trumps raw data though.
retval = re.sub('[ ({[]?[0-9]{3}[ )}\]]?[-\.+=: ]?[0-9]{3}[-\.+=: ]?[0-9]{4}',
overwriteChars,retval,0,re.MULTILINE)
retval = re.sub('[ ({[]?[0-9]{3}[ )}\]]?[-\.+=: ]?[0-9]{4}[-\.+=: ]?[0-9]{4}',
overwriteChars,retval,0,re.MULTILINE)
retval = re.sub('[ ({[]?[0-9]{2}[ )}\]]?[-\.+=: ]?[ ([]?[0-9]{2}[ )\]]?[-\.+=: ]?[0-9]{4}[-\.+=: ]?[0-9]{4}',
overwriteChars,retval,0,re.MULTILINE)
retval = re.sub('[ ({[]?[0-9]{3}[ )}\]]?[-\.+=: ]?[ ([]?[0-9]{1}[ )\]]?[-\.+=: ]?[0-9]{3}[-\.+=: ]?[0-9]{4}',
overwriteChars,retval,0,re.MULTILINE)
retval = re.sub('[ ({[]?[0-9]{3}[ )}\]]?[-\.+=: ]?[ ([]?[0-9]{1}[ )\]]?[-\.+=: ]?[0-9]{2}[-\.+=: ]?[0-9]{2}[-\.+=: ]?[0-9]{2}[-\.+=: ]?[0-9]{2}',
overwriteChars,retval,0,re.MULTILINE)
# Remove these specific lines. In many cases, these lines are quoted and
# re-wrapped (sometimes with chars inserted in arbitrary places) so this isn't
# foolproof.
retval = re.sub('^[>\s]*VisIt Users Wiki: http://[ +_\*]*visitusers.org/.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*Frequently Asked Questions for VisIt: http://[ +_\*]*visit.llnl.gov/FAQ.html.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*To Unsubscribe: send a blank email to visit-developers-unsubscribe at elist.ornl.gov.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*More Options: https://[ +_\*]*elist.ornl.gov/mailman/listinfo/visit-developers.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*To Unsubscribe: send a blank email to.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*visit-developers-unsubscribe at elist.ornl.gov.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*-------------- next part --------------.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*An HTML attachment was scrubbed\.\.\..*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*URL: <?https://[ +_\*]*elist.ornl.gov/pipermail/visit-developers/attachments.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*URL: <?https://[ +_\*]*email.ornl.gov/pipermail/visit-developers/attachments.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*List subscription information: https://[ +_\*]*email.ornl.gov/mailman/listinfo/visit-developers.*$', '',retval,0,re.MULTILINE)
retval = re.sub('^[>\s]*Searchable list archives: https://[ +_\*]*email.ornl.gov/pipermail/visit-users.*$', '',retval,0,re.MULTILINE)
#
# Filter out signature separator lines (e.g. '--') as these convince
# GitHub the message is really HTML formatted
#
retval = re.sub('^\s*-+\s*$','\n---\n',retval,0,re.MULTILINE)
# Take out some characters that cause problems with http/json parsing
retval = retval.replace('\\',' ')
retval = retval.replace('"',"'")
# Take out some characters that might be interpreted as a premature end
# of the code block in which this body text is being embedded.
retval = retval.replace('```',"'''")
retval = retval.replace('~~~',"'''")
# wrap the body text for the GitHub text box size (because we're going to
# literal quote it (```) and want to avoid creating content that requires
# horizontal scrolling)
mylist = []
for s in retval.split('\n'):
more = wrapper.wrap(s.strip())
if not more and mylist and mylist[-1] != '':
mylist += ['']
else:
mylist += more
retval = '\n'.join(mylist)
return retval
#
# Simple method to build body text for a message
#
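# The assembled body looks roughly like this (values illustrative):
#   **Date: Tue, 20 Jul 2021 10:21:40 -0700**
#   **From: someone at example.com**
#   This post was [imported from the ... email archive](...)
#   ---
#   ```
#   ...filtered message text...
#   ```
#   ---
#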
def buildBody(msgObj):
body = ''
body += '**Date: %s**\n'%mydt(msgObj['Date']).strftime('%a, %d %b %Y %H:%M:%S %z')
body += '**From: %s**\n'%msgObj['From']
body += 'This post was [imported from the visit-developers@ornl.gov email archive](https://github.com/visit-dav/visit/wiki/About-the-visit-users@ornl.gov-email-archive)\n'
body += '\n---\n```\n'
body += filterBody(msgObj.get_payload())
body += '\n```\n---\n'
return body
#
# load the restart file
#
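# The restart file holds one processed subject key per line. When a run fully
# completes, the file is chmod'd to mode 0 (see the end of
# importMessagesAsDiscussions and testWriteMessagesToTextFiles); the
# os.access(..., os.R_OK) check below detects that and refuses to run again
# until the file is removed.
#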
def restartFromRestart():
processedKeys = []
if os.path.exists('email2discussions-restart.txt'):
if not os.access('email2discussions-restart.txt', os.R_OK):
raise RuntimeError('It appears a previous run has fully completed. Remove "email2discussions-restart.txt" to rerun.')
with open('email2discussions-restart.txt', 'r') as f:
processedKeys = [l.strip() for l in f.readlines()]
return processedKeys
#
# Update our restart file
#
def updateRestart(k):
with open('email2discussions-restart.txt', 'a') as f:
f.write(k)
f.write('\n')
#
# Debug: write message strings to be used in http/json graphql
# queries to text files
#
def testWriteMessagesToTextFiles(msgLists):
shutil.rmtree("email2discussions-debug", ignore_errors=True)
os.mkdir("email2discussions-debug")
processedKeys = restartFromRestart()
i = 0
for k in list(msgLists.keys()):
if k in processedKeys:
print("Already processed \"%s\""%k)
continue
# Get the current message thread
mlist = msgLists[k]
# Create a valid file name from the thread key (subject)
#time.sleep(1)
kfname = k.replace("/","_").replace('<','_').replace('>','_')[:100]
# assumes the 'email2discussions-debug' dir created above is writable
with open("email2discussions-debug/%s"%kfname, 'w') as f:
subject = filterSubject(mlist[0]['Subject'])
body = buildBody(mlist[0])
if subject == 'No subject' or subject == 'no subject' or subject == '':
for s in body.split('\n')[5:]:
if s.strip():
subject = s.strip()
break
print("Working on thread %d, \"%s\""%(i,k))
print(" %d messages, subject \"%s\""%(len(mlist),subject))
f.write("Subject: \"%s\"\n"%subject)
f.write(body)
f.write("\n")
for m in mlist[1:]:
body = buildBody(m)
f.write(body)
f.write("\n")
i += 1
updateRestart(k)
# indicate run fully completed
os.chmod('email2discussions-restart.txt', 0)
#
# Loop over the message list, adding each thread of
# messages as a discussion with comments
#
def importMessagesAsDiscussions(msgLists, repoid, catid, labid):
# look for restart file
processedKeys = restartFromRestart()
# for k in list(msgLists.keys()): don't do whole shootin match yet
i = 0
for k in list(msgLists.keys()):
i += 1
if k in processedKeys:
print("Already processed \"%s\""%k)
continue
# Make sure we don't exceed GitHub's GraphQL API limits
throttleRate()
# Get the current message thread
mlist = msgLists[k]
# Use first message (index 0) in thread for subject to
# create a new discussion topic
subject = filterSubject(mlist[0]['Subject'])
print("Working on thread %d of %d (%d messages, subject = \"%s\")"%(i,len(msgLists.keys()),len(mlist),k))
body = buildBody(mlist[0])
discid = createDiscussion(repoid, catid, subject, body)
# label this discussion for easy filtering
addLabelsToLabelable(discid, labid)
# Use remaining messages in thread (starting from index 1)
# to add comments to this discussion
for m in mlist[1:]:
body = buildBody(m)
addDiscussionComment(discid, body)
# lock the discussion to prevent any non-owners from
# ever adding to it
lockLockable(discid)
#
# Update restart state
#
updateRestart(k)
# indicate run fully completed
os.chmod('email2discussions-restart.txt', 0)
#
# Main Program
#
# Read all email messages into a list sorted by date
items = readAllMboxFiles()
# Remove duplicates
items = removeDateAndSubjectDups(items)
# Thread the messages
msgLists = threadMessages(items)
# Eliminate common bad cases
removeBadMessages(msgLists)
#printDiagnostics(msgLists)
#testWriteMessagesToTextFiles(msgLists)
#sys.exit(0)
# Get the repository id where the discussions will be created
repoid = GetRepoID("visit-dav", "visit")
# Get the discussion category id for the email migration discussion
catid = GetObjectIDByName("visit-dav", "visit", "discussionCategories", 10, "visit-developers email archive")
# Get the label id for the 'email archive' label
labid = GetObjectIDByName("visit-dav", "visit", "labels", 30, "email archive")
# Import all the message threads as discussions
importMessagesAsDiscussions(msgLists, repoid, catid, labid)