Last active
February 26, 2021 15:57
-
-
Save dwinston/af77d9cdebb6bc3cf51e847cc44463c4 to your computer and use it in GitHub Desktop.
get all Zulip messages sent by non-bot users to public streams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A script developed to get all Zulip messages sent by non-bot users to public streams.
Requires `pip install pymongo tqdm zulip` and a running local MongoDB server.
You can also adapt the script to append to an in-memory Python list instead, in which case neither MongoDB nor pymongo is needed.
In my case (see the in-script comments), the total volume of data was about 700MB uncompressed.
Developed at the Recurse Center (https://www.recurse.com/) in order to apply PageRank to Zulip entities.
Licensed as <https://opensource.org/licenses/MIT>(year=2021, copyright_holder="Donny Winston").
""" | |
from pprint import pprint | |
import time | |
from pymongo import MongoClient | |
from tqdm import tqdm | |
import zulip | |
# Zulip API client, configured from the ~/.zuliprc file.
zclient = zulip.Client(config_file="~/.zuliprc")

# MongoDB client created with default arguments; everything this script
# stores goes into the "rc_mldp" database.
mclient = MongoClient()
db = mclient["rc_mldp"]

# Sorted user IDs of every listed member whose "is_bot" flag is falsy.
result = zclient.get_members()
users_no_bots = sorted(
    member["user_id"] for member in result["members"] if not member["is_bot"]
)
def get_messages_from(sender, anchor='oldest', num_after=1000):
    """Build a get-messages request for one sender's public-stream messages.

    The returned dict asks for up to ``num_after`` messages after ``anchor``
    and none before it, narrowed to messages from ``sender`` and to public
    streams.  Nothing is sent from here; the dict is only constructed.
    """
    sender_filter = {"operator": "sender", "operand": sender}
    public_streams_filter = {"operator": "streams", "operand": "public"}
    request = dict(
        anchor=anchor,
        num_before=0,
        num_after=num_after,
        narrow=[sender_filter, public_streams_filter],
    )
    return request
def bulk_get_messages_from(sender, first_anchor='oldest', batch_size=1000, limit=None):
    """Fetch a sender's public-stream messages in batches and store them.

    Repeatedly requests pages through ``zclient.get_messages`` and inserts
    each non-empty page into ``db.messages``, moving the anchor just past the
    highest message id seen, until the response reports ``found_newest`` or
    ``limit`` messages have been stored.  A ``RATE_LIMIT_HIT`` response is
    retried after sleeping for the advertised ``retry-after`` (1 second if
    absent) plus a 0.1 second margin.

    Args:
        sender: Operand of the ``sender`` narrow; every returned message's
            ``sender_id`` is asserted to equal it.
        first_anchor: Anchor used for the first request.
        batch_size: Maximum ``num_after`` requested per call.
        limit: Target maximum number of messages to store.  ``None`` (or any
            other falsy value) means no limit.  Each request is sized to the
            remaining allowance so the limit is not overshot by a whole batch.

    Returns:
        The string ``"OK"``.

    Raises:
        RuntimeError: If a response reports a failure other than
            ``RATE_LIMIT_HIT``.
    """
    total = 0
    anchor = first_anchor
    # The context manager closes the progress bar even when a request or an
    # insert raises part-way through.
    with tqdm(total=limit) as pbar:
        while not limit or total < limit:
            # Ask only for what the limit still allows.
            # NOTE(review): the server may return the anchor message in
            # addition to num_after newer ones, so a one-message overshoot
            # per page is still possible -- confirm if exactness matters.
            if limit:
                num_after = min(batch_size, int(limit - total))
            else:
                num_after = batch_size
            request = get_messages_from(sender, anchor, num_after=num_after)
            result = zclient.get_messages(request)
            messages = result.get("messages", [])
            if messages:
                assert all(m["sender_id"] == sender for m in messages)
                db.messages.insert_many(messages)
                total += len(messages)
                pbar.update(len(messages))
            if result.get("found_newest"):
                break
            if result.get("code") == "RATE_LIMIT_HIT":
                time.sleep(result.get("retry-after", 1) + 0.1)
                continue
            if result.get("result", "success") != "success":
                # Previously any other failure fell through to max() on an
                # empty list and died with an unrelated ValueError.
                raise RuntimeError(
                    "get_messages failed for sender {!r}: {}".format(
                        sender, result.get("msg", result)
                    )
                )
            if not messages:
                # Empty page without found_newest: there is nothing to
                # advance the anchor with, so stop instead of crashing.
                break
            anchor = max(m["id"] for m in messages) + 1
    return "OK"
# Reference run against https://recurse.zulipchat.com/ on 2021-02-05:
# ~13 minutes total to fetch ~800k messages => ~700MB uncompressed.
for uid in tqdm(users_no_bots):
    bulk_get_messages_from(uid)

# Indexes are created after the fetch loop has finished: "id" with a
# uniqueness constraint, "sender_id" as a plain index.
db["messages"].create_index("id", unique=True)
db["messages"].create_index("sender_id")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment