@dwinston
Last active February 26, 2021 15:57
get all Zulip messages sent by non-bot users to public streams
"""
A script developed to get all Zulip messages sent by non-bot users to public streams.
Requires `pip install pymongo tqdm zulip` and a running local MongoDB server.
Alternatively, you can adapt the script to append to an in-memory Python list and skip MongoDB/pymongo entirely (see the sketch just below this docstring).
In my case (see in-script comments), the total volume of data was ~700MB uncompressed.
Developed at the Recurse Center (https://www.recurse.com/) in order to apply PageRank to Zulip entities.
Licensed as <https://opensource.org/licenses/MIT>(year=2021, copyright_holder="Donny Winston").
"""
from pprint import pprint
import time
from pymongo import MongoClient
from tqdm import tqdm
import zulip
zclient = zulip.Client(config_file="~/.zuliprc")  # Zulip API credentials
mclient = MongoClient()  # local MongoDB server (default host/port)
db = mclient["rc_mldp"]
# Collect the user IDs of all non-bot members, in ascending order.
result = zclient.get_members()
users_no_bots = sorted([m["user_id"] for m in result["members"] if not m["is_bot"]])
def get_messages_from(sender, anchor='oldest', num_after=1000):
    """Build a get_messages request for up to num_after messages sent by sender to public streams, starting at anchor."""
    return {
        "anchor": anchor,
        "num_before": 0,
        "num_after": num_after,
        "narrow": [
            {"operator": "sender", "operand": sender},
            {"operator": "streams", "operand": "public"},
        ],
    }
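# For example, get_messages_from(12345) returns a request dict like:
#   {"anchor": "oldest", "num_before": 0, "num_after": 1000,
#    "narrow": [{"operator": "sender", "operand": 12345},
#               {"operator": "streams", "operand": "public"}]}
# which zclient.get_messages() accepts directly.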
def bulk_get_messages_from(sender, first_anchor='oldest', batch_size=1000, limit=None):
    """Fetch and store all public-stream messages from sender, in batches of batch_size."""
    total = 0
    anchor = first_anchor
    pbar = tqdm(total=limit)
    limit = limit or 1e18  # effectively unlimited unless a limit is given
    while total < limit:
        request = get_messages_from(sender, anchor, num_after=batch_size)
        result = zclient.get_messages(request)
        messages = result.get("messages", [])
        if messages:
            assert all(m["sender_id"] == sender for m in messages)
            db.messages.insert_many(messages)
            total += len(messages)
            pbar.update(len(messages))
        if result.get("found_newest"):
            # No messages newer than this batch -- done with this sender.
            break
        if result.get("code") == "RATE_LIMIT_HIT":
            # Back off for the server-suggested interval, plus a small buffer.
            time.sleep(result.get("retry-after", 1) + 0.1)
        else:
            # Advance the anchor past the newest message fetched in this batch.
            anchor = max(m["id"] for m in messages) + 1
    pbar.close()
    return "OK"
# In my case, ~13 minutes total to fetch ~800k messages => ~700MB uncompressed
# for https://recurse.zulipchat.com/ on 2021-02-05
for uid in tqdm(users_no_bots):
    bulk_get_messages_from(uid)

# Indexes: dedupe/lookup by Zulip message id, and support per-sender queries.
db.messages.create_index("id", unique=True)
db.messages.create_index("sender_id")