Skip to content

Instantly share code, notes, and snippets.

@apoorvalal
Last active October 1, 2023 17:40
Show Gist options
  • Save apoorvalal/eb3efb0996dba73f2d65481cb32907dc to your computer and use it in GitHub Desktop.
Save apoorvalal/eb3efb0996dba73f2d65481cb32907dc to your computer and use it in GitHub Desktop.
Bot that checks arXiv RSS feeds and posts updates to Bluesky (bsky). Adapt it to your own needs.
#!/home/alal/anaconda3/bin/python3
"""
Script to post new articles from arxiv stat.ME and econ.EM. Bring your own handle and app-password.
"""
# %%
import json
import os
import random
import re
import sys
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional

import feedparser
import requests
# %%
def bsky_login_session(
    pds_url: str, handle: str, password: str, timeout: float = 30.0
) -> Dict:
    """Log in to Bluesky and return the session payload.

    Args:
        pds_url (str): base URL of the server, e.g. "https://bsky.social"
        handle (str): account handle, sent as the "identifier" field
        password (str): app password
        timeout (float, optional): seconds to wait for the server before
            giving up. Defaults to 30.0.

    Returns:
        Dict: decoded JSON body of the createSession response

    Raises:
        requests.HTTPError: if the server answers with an error status
        requests.Timeout: if the server does not answer within `timeout`
    """
    resp = requests.post(
        pds_url + "/xrpc/com.atproto.server.createSession",
        json={"identifier": handle, "password": password},
        # Without a timeout requests waits forever on an unresponsive server.
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()
def parse_urls(text: str) -> List[Dict]:
    """Find URLs in a string and report their UTF-8 byte offsets.

    Offsets are measured on the UTF-8 encoding of `text`, not on the str,
    so multi-byte characters before a URL shift its position accordingly.

    Args:
        text (str): text to scan

    Returns:
        List[Dict]: one dict per URL with keys "start" and "end" (byte
            offsets, end exclusive) and "url" (the matched URL as str)
    """
    # Partial/naive URL regex based on: https://stackoverflow.com/a/3809435
    # tweaked to disallow some trailing punctuation.
    # The prefix is "start of text OR a non-word character"; a character
    # class cannot express an anchor, so the previous "[$|\W]" prefix never
    # matched a URL at the very beginning of the text.
    url_regex = rb"(?:^|\W)(https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*[-a-zA-Z0-9@%_\+~#//=])?)"
    text_bytes = text.encode("UTF-8")
    return [
        {
            "start": m.start(1),
            "end": m.end(1),
            "url": m.group(1).decode("UTF-8"),
        }
        for m in re.finditer(url_regex, text_bytes)
    ]
def parse_facets(text: str) -> List[Dict]:
    """Build one app.bsky.richtext.facet link object per URL in the text.

    Args:
        text (str): post text to scan for URLs (https://example.com)

    Returns:
        List[Dict]: facet dicts carrying the byte range of each URL and a
            link feature pointing at it; empty if no URL is found
    """
    return [
        {
            "index": {"byteStart": span["start"], "byteEnd": span["end"]},
            "features": [
                {
                    "$type": "app.bsky.richtext.facet#link",
                    # NOTE: URI ("I") not URL ("L")
                    "uri": span["url"],
                }
            ],
        }
        for span in parse_urls(text)
    ]
def create_post(
    text: str,
    pds_url: str = "https://bsky.social",
    handle: Optional[str] = None,
    password: Optional[str] = None,
):
    """Publish a text post on Bluesky.

    Logs in, attaches link facets for any URLs in the text, creates the
    record, and prints the server response.

    Args:
        text (str): post text
        pds_url (str, optional): server base URL. Defaults to "https://bsky.social".
        handle (str, optional): account handle. Defaults to the BSKYBOT
            environment variable (set it in your bashrc/zshrc).
        password (str, optional): app password. Defaults to the BSKYPWD
            environment variable.

    Raises:
        KeyError: if a credential is omitted and its environment variable is unset
        requests.HTTPError: if login or record creation returns an error status
    """
    # Read credentials when called, not when the module is imported, so a
    # missing variable cannot break the import itself.
    if handle is None:
        handle = os.environ["BSKYBOT"]
    if password is None:
        password = os.environ["BSKYPWD"]

    session = bsky_login_session(pds_url, handle, password)
    now = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")
    # these are the required fields which every post must include
    post = {
        "$type": "app.bsky.feed.post",
        "text": text,
        "createdAt": now,
    }
    # parse out URLs as "facets" so they render as links
    if text:
        facets = parse_facets(post["text"])
        if facets:
            post["facets"] = facets
    resp = requests.post(
        pds_url + "/xrpc/com.atproto.repo.createRecord",
        headers={"Authorization": "Bearer " + session["accessJwt"]},
        json={
            "repo": session["did"],
            "collection": "app.bsky.feed.post",
            "record": post,
        },
        # Without a timeout requests waits forever on an unresponsive server.
        timeout=30,
    )
    print("createRecord response:", file=sys.stderr)
    try:
        payload = json.dumps(resp.json(), indent=2)
    except ValueError:
        # A non-JSON body (e.g. a gateway error page) must not hide the
        # HTTP error raised below.
        payload = resp.text
    print(payload)
    resp.raise_for_status()
# %%
def get_arxiv_feed(subject: str):
    """Get a skeetable list of paper title, link, and (fragment of) abstract.

    Args:
        subject (str): valid arxiv subject, e.g. "stat.ME" or "econ.EM" or "cs.LG"

    Returns:
        list of str: one ready-to-post text per feed entry
    """
    feed = feedparser.parse(f"https://export.arxiv.org/rss/{subject}")
    return [
        _format_skeet(entry.title, entry.link, entry.description)
        for entry in feed.entries
    ]


def _format_skeet(title: str, link: str, description: str, limit: int = 296) -> str:
    """Join title, link and abstract into one post text cut to `limit` characters."""
    # Presumably the earlier split on "." was meant to drop a trailing
    # ". (arXiv:...)" tag from the title -- TODO confirm against the feed.
    # Splitting on the first period also cut any title that contains one
    # ("A vs. B"), so only that trailing tag is removed here.
    clean_title = re.sub(r"\.?\s*\(arXiv:[^)]*\)\s*$", "", title).strip()
    abstract = description.replace("<p>", "").replace("</p>", "").strip()
    body = f"{clean_title}\n{link.strip()}\n{abstract}"
    # Cut to `limit` characters, flatten to one line, then add the 3-character signature.
    return body[:limit].replace("\n", " ") + "\n📈🤖"
# %%
def main():
    """Post the current stat.ME and econ.EM arXiv feed entries to Bluesky.

    Feeds are processed in order, and each post is followed by a random
    pause of 1-5 seconds. A text that was already posted during this run
    is skipped.
    """
    # stats, then metrics. stat.ML is deliberately left out: #toomuchcontent
    subjects = ("stat.ME", "econ.EM")
    posted = set()
    for subject in subjects:
        for entry in get_arxiv_feed(subject):
            # A paper listed under both subjects may appear in both feeds
            # with identical text; post it only once.
            if entry in posted:
                continue
            posted.add(entry)
            create_post(text=entry)
            time.sleep(random.randint(1, 5))
# %%
# Run the bot only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment