Created
May 28, 2018 15:23
-
-
Save wolny/978bdc54e5d9888bf281629c002c5664 to your computer and use it in GitHub Desktop.
Mailman list members export based on https://wiki.list.org/DOC/How%20do%20I%20extract%20%28export%29%20a%20list%20of%20my%20list%27s%20members%20%28subscribers%29%3F (ignore SSL cert verification)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""List the email addresses subscribed to a mailing list, fetched from web. | |
Usage: %(PROGRAM)s [options] hostname listname password | |
Where: | |
--output file | |
-o file | |
Write output to specified file instead of standard out. | |
--regular | |
-r | |
List only the regular (non-digest) members. | |
--digest={any|mime|plain} | |
-d {any|mime|plain} | |
List only the digest members. One of 'any', 'mime' or 'plain' | |
is required. | |
'any' lists all the digest members. | |
'mime' lists only the mime digest members. | |
'plain' lists only the plain digest members. | |
--fullnames | |
-f | |
Include the full names in the output. | |
--nomail={any|admin|bounce|user|unknown|enabled} | |
-n {any|admin|bounce|user|unknown|enabled} | |
List members based on their nomail status. One of 'any', 'admin', | |
'bounce', 'user', 'unknown' or 'enabled' is required. | |
'any' lists members with delivery disabled for any reason. | |
'admin' lists members with delivery disabled by admin. | |
'bounce' lists members with delivery disabled by bounce. | |
'user' lists members with delivery disabled by the member. | |
'unknown' lists members with delivery disabled by mailman 2.0 | |
'enabled' lists members with delivery enabled. | |
--csv | |
-c | |
This option overrides the above four selection options and lists | |
all members, one per line, with comma separated, quoted values as | |
follows: | |
"full name" if available, else "","email address","mod", | |
"hide","nomail" ("off" or "[A]" or "[B]" or "[U]" or "[?]"), | |
"ack","not metoo","nodupes","digest","plain" | |
analogous to the admin membership list (the values of the 'checkbox' | |
fields are either "off" or "on"). A title line with the above names | |
is listed before the member lines. | |
--url_path path | |
-u path | |
If the list admin pages are accessed at your site via a URL of form | |
different from http://hostname/mailman/admin/listname, you need to | |
specify the path portion of the URL that is between hostname and | |
/listname with this option. For example, a URL such as | |
http://hostname/admin.cgi/listname requires the option | |
--url_path /admin.cgi | |
or | |
-u /admin.cgi | |
and a URL like http://hostname/cgi-bin/mailman/admin/listname | |
requires the option | |
--url_path /cgi-bin/mailman/admin | |
or | |
-u /cgi-bin/mailman/admin | |
Default value is /mailman/admin. | |
--unhide | |
-U | |
Set the 'hidden' flag off for all list members including those not | |
selected for output. This will take a long time if there are a lot | |
of hidden members. The -v option prints '.' after every 100 unhides. | |
--ssl | |
-s | |
Use https instead of http for accessing the list. | |
--verbose | |
-v | |
Include extra progress output. | |
--help | |
-h | |
Print this help message and exit | |
hostname is the name used in the URL of the list's web interface | |
listname is the name of the mailing list | |
password is the list's admin password | |
The list of subscribers is fetched from the web administrative | |
interface. Using the bin/list_members program from a shell | |
account is preferable, but not always available. | |
Tested with the Mailman 2.1.5 - 2.1.18 Membership list layout. | |
If Python 2.4's cookielib is available, use it. Otherwise require | |
ClientCookie http://wwwsearch.sourceforge.net/ClientCookie/ | |
This script runs on your workstation and requires that you have Python | |
<http://www.python.org> installed. It works best with Python 2.4.x | |
through Python 2.7.x. It is not tested with Python 3.x.x. | |
""" | |
import sys | |
import re | |
import string | |
import urllib | |
import getopt | |
import httplib | |
import urllib2 | |
import ssl | |
from time import sleep | |
from HTMLParser import HTMLParser | |
# Build ``opener``, the callable used for every HTTP(S) request below.  It
# has to carry cookies from one request to the next, because the admin login
# is remembered only through the cookie set by the first POST.
try:
    # if we have Python 2.4's cookielib, use it
    import cookielib
except ImportError:
    import ClientCookie
    # if this is a new ClientCookie, we need to turn on RFC2965 cookies
    cookiejar = ClientCookie.CookieJar()
    try:
        cookiejar.set_policy(ClientCookie.DefaultCookiePolicy(rfc2965=True))
        # install an opener that uses this policy
        opener = ClientCookie.build_opener(
            ClientCookie.HTTPCookieProcessor(cookiejar))
        ClientCookie.install_opener(opener)
    except AttributeError:
        # must be an old ClientCookie, which already accepts RFC2965 cookies
        pass
    opener = ClientCookie.urlopen
else:
    policy = cookielib.DefaultCookiePolicy(rfc2965=True)
    cookiejar = cookielib.CookieJar(policy)
    _handlers = [urllib2.HTTPCookieProcessor(cookiejar)]
    # NOTE(security): TLS certificate verification is deliberately switched
    # off here, so the script also works against hosts with self-signed or
    # otherwise unverifiable certificates.  The admin password is therefore
    # exposed to a man-in-the-middle; only use this on networks you trust.
    #
    # ssl._create_unverified_context() and the ``context`` argument of
    # HTTPSHandler only exist from Python 2.7.9 on.  Older interpreters do
    # not verify certificates by default, so the stock HTTPS handler already
    # gives the same result there.
    ctx = None
    _make_ctx = getattr(ssl, '_create_unverified_context', None)
    if _make_ctx is not None:
        ctx = _make_ctx()
        _handlers.insert(0, urllib2.HTTPSHandler(context=ctx))
    opener = urllib2.build_opener(*_handlers).open
# Name this script was invoked as; usage() substitutes it for %(PROGRAM)s
# in the module docstring.
PROGRAM = sys.argv[0]

# The former "define True/False when missing" shim has been dropped: the
# boolean constants are built in on every interpreter able to import this
# module (the top-level ``import ssl`` already needs Python 2.6), and
# assigning to True/False is a syntax error for Python 3 based tooling.
def usage(code, msg=''):
    """Print the help text, then *msg* if it is non-empty, and exit.

    The help text is the module docstring with %(PROGRAM)s filled in from
    the module globals.  Output goes to stderr when *code* is non-zero and
    to stdout otherwise; *code* is also passed to sys.exit().
    """
    stream = sys.stderr if code else sys.stdout
    stream.write(__doc__ % globals() + '\n')
    if msg:
        stream.write('%s\n' % (msg,))
    sys.exit(code)
# Module-level state shared between MailmanHTMLParser and main().

# Maps each member key (the address as it appears in the form field names,
# still URL-quoted) to a dict of option suffix -> value.
subscribers = {}

# Suffixes of the per-member form field names that are collected.
vnames = [
    '_realname',
    '_mod',
    '_hide',
    '_nomail',
    '_ack',
    '_notmetoo',
    '_nodupes',
    '_digest',
    '_plain',
]

# Highest chunk number linked from the pages seen for the current letter.
maxchunk = 0

# Letters still to be fetched, and letters already fetched.
letters = ['0']
processed_letters = []

# True while the text following a checked "nomail" box is still expected.
gotnomail = False
class MailmanHTMLParser(HTMLParser):
    '''Cheap way to find email addresses and pages with multiple
    chunks from Mailman 2.1.5 membership pages.

    Nothing is kept on the instance.  Results go into the module-level
    names ``subscribers``, ``letters``, ``maxchunk``, ``gotnomail`` and
    ``subemail``, so state carries over from one parsed page to the next.
    ``url_path``, ``page_cset`` and ``my_cset`` are read as module globals
    and must be set (main() does this) before a page is fed.
    '''

    def handle_starttag(self, tag, attrs):
        '''Collect member option fields and membership navigation links.

        <input> tags whose name ends in one of ``vnames`` are recorded in
        ``subscribers`` under the name with that suffix removed.  <a> tags
        whose href contains ``url_path + "/"`` are scanned for ``chunk=``
        and ``letter=`` parameters to find further pages to fetch.
        '''
        global maxchunk, gotnomail, subemail
        if tag == 'input':
            name = None
            # An <input> without a value attribute is recorded as the empty
            # string instead of failing on an unassigned local.
            value = ''
            for attr, val in attrs:
                if attr == 'name':
                    name = val
                elif attr == 'value' and val is not None:
                    value = val
            if name is None:
                return
            for vname in vnames:
                if not name.endswith(vname):
                    continue
                subemail = name[:-len(vname)]
                member = subscribers.setdefault(subemail, {})
                if vname == '_nomail' and value == "on":
                    # The reason code is the text that follows the
                    # checkbox; handle_data() stores it.
                    gotnomail = True
                else:
                    text = value
                    # On Python 2 ``bytes`` is ``str``, so this is the
                    # "not already unicode" test.
                    if isinstance(text, bytes):
                        text = text.decode(page_cset, 'replace')
                    member[vname] = text.encode(my_cset, 'replace')
        elif tag == 'a':
            marker = "%s/" % (url_path)
            for attr, val in attrs:
                if attr != 'href' or val is None or marker not in val:
                    continue
                m = re.search(r'chunk=(?P<chunkno>\d+)', val, re.I)
                if m:
                    maxchunk = max(maxchunk, int(m.group('chunkno')))
                m = re.search(r'letter=(?P<letter>.)', val, re.I)
                if m:
                    letter = m.group('letter')
                    if (letter not in letters
                            and letter not in processed_letters):
                        letters.append(letter)

    def handle_data(self, data):
        '''Store the text following a checked nomail box as its reason.'''
        global gotnomail
        if gotnomail:
            gotnomail = False
            subscribers[subemail]['_nomail'] = data
# Title line written before the member lines in --csv mode.
_CSV_HEADER = ('"Full name","email address","mod","hide",'
               '"nomail","ack","not metoo","nodupes","digest","plain"')

# Option values following the name and address in a --csv line, in order.
_CSV_COLUMNS = ('_mod', '_hide', '_nomail', '_ack', '_notmetoo',
                '_nodupes', '_digest', '_plain')

# --nomail choices that select exactly one reason code.
_NOMAIL_FLAGS = {'admin': "[A]", 'bounce': "[B]", 'user': "[U]",
                 'unknown': "[?]"}


def _csv_field(value):
    """Return *value* as a quoted CSV field, doubling embedded quotes."""
    return '"%s"' % value.replace('"', '""')


def _member_selected(d, nomail, regular, digest):
    """Tell whether member options *d* pass the -n, -r and -d filters."""
    if nomail == 'enabled' and d['_nomail'] != "off":
        return False
    if nomail == 'any' and d['_nomail'] == "off":
        return False
    wanted = _NOMAIL_FLAGS.get(nomail)
    if wanted is not None and d['_nomail'] != wanted:
        return False
    if regular and d['_digest'] == "on":
        return False
    if digest and d['_digest'] == "off":
        return False
    if digest == "mime" and d['_plain'] == "on":
        return False
    if digest == "plain" and d['_plain'] == "off":
        return False
    return True


def _login(member_url, listname, password):
    """Post the admin password, set ``page_cset`` and check the reply.

    Exits through usage() when the request fails, when the login form comes
    back, or when the reply contains no form at all.
    """
    global page_cset
    # login, picking up the cookie
    try:
        page = opener(member_url, urllib.urlencode({'adminpw': password}))
    except (urllib2.URLError, httplib.InvalidURL) as e:
        if isinstance(e, urllib2.HTTPError) and e.code == 401:
            usage(1, 'Invalid password.')
        else:
            sys.stderr.write(repr(e) + '\n')
            usage(1, "Error accessing %s\n"
                     "Supplied host or listname may be incorrect,\n"
                     "or you may need to specify --url_path.\n"
                     % (member_url))
    try:
        # Get the charset of the page, but use iso-8859-1 for ascii or None.
        page_cset = page.info().getparam('charset') or 'iso-8859-1'
        if page_cset.lower().endswith('ascii'):
            page_cset = 'iso-8859-1'
        lines = page.read()
    finally:
        page.close()
    # Try to recognize the returned page independent of the list language
    if re.search(r'INPUT\s+type="SUBMIT"\s+name="admlogin"', lines,
                 re.M + re.I):
        # login page - invalid password
        usage(1,
              'Login invalid - possibly incorrect password or missing -s option.')
    if not re.search(r'<form\s+action=', lines, re.M + re.I):
        # no <form> tag - admin overview page
        usage(1, "Non-existent list: %s.\n"
                 "If the provided list name is valid, the supplied host may "
                 "be incorrect\n"
                 "or you may need to specify --url_path.\n" % listname)


def _fetch_members(member_url, verbose):
    """Fetch every chunk of every letter and feed it to the parser.

    The parser appends newly discovered letters to ``letters`` and raises
    ``maxchunk`` while pages are read, so both loops grow as links are
    found.
    """
    global maxchunk
    # loop through the letters, and all chunks of each
    while letters:
        letter = letters.pop(0)
        processed_letters.append(letter)
        chunk = 0
        maxchunk = 0
        while chunk <= maxchunk:
            if verbose:
                sys.stderr.write("%c(%d)\n" % (letter, chunk))
            url = member_url + "?letter=%s&chunk=%d" % (letter, chunk)
            # A failed fetch is retried every two seconds, without limit.
            while True:
                try:
                    page = opener(url)
                    lines = page.read()
                    page.close()
                except urllib2.URLError:
                    if verbose:
                        sys.stderr.write(
                            'Error encountered in accessing web page. '
                            'Retrying.\n')
                    sleep(2)
                else:
                    break
            parser = MailmanHTMLParser()
            parser.feed(lines)
            parser.close()
            chunk += 1


def _write_members(fp, options_url, as_csv, unhide, verbose, fullnames,
                   nomail, regular, digest):
    """Write the collected members to *fp*, unhiding them first if asked."""
    if as_csv:
        fp.write(_CSV_HEADER + '\n')
    nunhide = 0
    for email, d in sorted(subscribers.items()):
        if unhide and d['_hide'] == "on":
            params = urllib.urlencode({'conceal': 0, 'options-submit': 1})
            opener("%s/%s" % (options_url, email), params).close()
            d['_hide'] = "off"
            nunhide += 1
            if verbose and nunhide % 100 == 0:
                sys.stderr.write('.')
        email = urllib.unquote(email)
        if as_csv:
            # --csv overrides the selection options: every member is listed.
            row = [d['_realname'], email] + [d[key] for key in _CSV_COLUMNS]
            fp.write(','.join([_csv_field(item) for item in row]) + '\n')
            continue
        if not _member_selected(d, nomail, regular, digest):
            continue
        if fullnames and d['_realname'] != "":
            fp.write('%s <%s>\n' % (d['_realname'], email))
        else:
            fp.write(email + '\n')


def main():
    """Parse the command line, log in, fetch the members and print them.

    Exits through usage() on bad options or arguments, on a failed login
    and on an unrecognised reply page.  Sets the module globals
    ``url_path`` and ``my_cset`` that MailmanHTMLParser reads.
    """
    global url_path, my_cset
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "ho:rd:fn:cu:Uvs",
            ["help", "output=", "regular", "digest=", "fullnames",
             "nomail=", "csv", "url_path=", "unhide", "verbose",
             "ssl"])
    except getopt.GetoptError as e:
        usage(2, str(e))
    output_path = None
    fullnames = False
    nomail = None
    verbose = False
    regular = False
    digest = None
    as_csv = False
    unhide = False
    protocol = 'http'
    url_path = '/mailman/admin'
    for o, a in opts:
        if o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-h", "--help"):
            usage(0)
        elif o in ("-o", "--output"):
            output_path = a
        elif o in ("-f", "--fullnames"):
            fullnames = True
        elif o in ("-n", "--nomail"):
            nomail = a.lower()
        elif o in ("-r", "--regular"):
            regular = True
        elif o in ("-d", "--digest"):
            digest = a.lower()
        elif o in ("-c", "--csv"):
            as_csv = True
        elif o in ("-u", "--url_path"):
            url_path = a
        elif o in ("-U", "--unhide"):
            unhide = True
        elif o in ("-s", "--ssl"):
            protocol = 'https'
    if regular and digest:
        usage(2, "Both 'regular' and 'digest' will produce an empty list.")
    if digest not in [None, 'any', 'mime', 'plain']:
        usage(2, "Digest type %s unrecognized" % digest)
    if nomail not in [None, 'any', 'admin', 'bounce', 'user', 'unknown',
                      'enabled']:
        usage(2, "Nomail type %s unrecognized" % nomail)
    if len(args) != 3:
        usage(2)
    hostname, listname, password = args
    member_url = '%s://%s%s/%s/members' % (protocol, hostname, url_path,
                                           listname)
    options_url = '%s://%s%s/%s' % (protocol, hostname,
                                    re.sub('admin', 'options', url_path),
                                    listname)
    def_cset = sys.getdefaultencoding()
    if def_cset.lower().endswith('ascii'):
        def_cset = 'iso-8859-1'
    my_cset = sys.stdout.encoding or def_cset
    # Open the output file only once the arguments are known to be valid,
    # and close it on every exit path; sys.stdout is left open.
    if output_path is None:
        fp = sys.stdout
    else:
        fp = open(output_path, "wt")
    try:
        _login(member_url, listname, password)
        _fetch_members(member_url, verbose)
        # print the subscribers list
        _write_members(fp, options_url, as_csv, unhide, verbose, fullnames,
                       nomail, regular, digest)
    finally:
        if fp is not sys.stdout:
            fp.close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment