Skip to content

Instantly share code, notes, and snippets.

@amaya382
Last active January 9, 2020 08:10
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amaya382/a437ce5f9bffc79c742b39da731c1e89 to your computer and use it in GitHub Desktop.
Save amaya382/a437ce5f9bffc79c742b39da731c1e89 to your computer and use it in GitHub Desktop.
Fetch papers from arXiv and post notifications about them to Slack
# based on https://www.takapy.work/entry/2019/07/15/121436
import os
import re
import json
import urllib.request
import datetime
def parse_xml(data, tag):
    """Return the inner text of every ``<tag>...</tag>`` element in *data*.

    This is a lightweight regex scan rather than a real XML parser: only
    elements whose opening tag is exactly ``<tag>`` (no attributes) are
    matched, matching is non-greedy, and the captured text is returned
    as-is (entities are not unescaped).

    Args:
        data: Text to search.
        tag: Element name; matched literally.

    Returns:
        A list with the captured contents in document order; empty when
        nothing matches.
    """
    # Escape the tag so regex metacharacters in it (e.g. ".") match literally,
    # and use raw strings so "\s" / "\S" are regex escapes rather than
    # invalid string escapes.
    name = re.escape(tag)
    pattern = r"<" + name + r">([\s\S]*?)</" + name + r">"
    return re.findall(pattern, data)
def fetch_papers(categories, keywords, base_date, prev_date):
    """Query the arXiv API for entries submitted within a date window.

    Args:
        categories: Iterable of arXiv category names (strings); combined
            with OR.
        keywords: Iterable of search terms (strings); each is searched with
            the ``all:`` prefix and the terms are combined with AND.
        base_date: End of the window. Only the date part is used (via
            ``strftime("%Y%m%d")``); the time is fixed to ``0000``.
        prev_date: Start of the window, formatted the same way.

    Returns:
        A list of strings, one per ``<entry>`` element of the response, as
        extracted by ``parse_xml``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on network/HTTP failure.
    """
    from urllib.parse import quote

    date_clause = (
        "submittedDate:["
        + prev_date.strftime("%Y%m%d") + "0000+TO+"
        + base_date.strftime("%Y%m%d") + "0000]"
    )
    clauses = [date_clause]

    # Percent-encode caller-supplied terms so spaces or reserved characters
    # cannot produce an invalid URL or alter the query structure.
    category_terms = ["cat:" + quote(category, safe="") for category in categories]
    keyword_terms = ["all:" + quote(keyword, safe="") for keyword in keywords]

    # Skip a group entirely when it is empty instead of emitting "%28%29".
    if category_terms:
        clauses.append("%28" + "+OR+".join(category_terms) + "%29")
    if keyword_terms:
        clauses.append("%28" + "+AND+".join(keyword_terms) + "%29")

    url = "http://export.arxiv.org/api/query?search_query=" + "+AND+".join(clauses)

    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8")
    return parse_xml(data, "entry")
def notify(webhook, papers, interval, keywords):
    """Post a summary line and then one attachment per paper to Slack.

    Args:
        webhook: Webhook URL handed through to ``post_to_slack``.
        papers: Sequence of raw ``<entry>`` strings to report.
        interval: Number of days covered; only used in the summary text.
        keywords: Search keywords; joined with ", " for the summary text.

    Raises:
        IndexError: If an entry lacks an ``id``, ``title``, ``published`` or
            ``summary`` element.
    """
    key = ", ".join(keywords)

    # Nothing found: report that and stop.
    if len(papers) == 0:
        post_to_slack(webhook, text=f"直近{interval}日にPublishされた論文はありませんでした [{key}]")
        return

    post_to_slack(webhook, text=f"直近{interval}日にPublishされた{len(papers)}件の論文が見つかりました [{key}]")

    for entry in papers:
        link = parse_xml(entry, "id")[0]
        # Collapse runs of whitespace/newlines in the title to single spaces.
        heading = " ".join(parse_xml(entry, "title")[0].split())
        # Keep only the first 10 characters of the published timestamp.
        published_on = parse_xml(entry, "published")[0][:10]
        authors = ", ".join(parse_xml(entry, "name"))
        abstract = " ".join(parse_xml(entry, "summary")[0].split())

        fields = [
            {"title": "Abstract", "value": abstract},
            {"title": "Published", "value": published_on},
        ]
        card = {
            "title": heading,
            "title_link": link,
            "author_name": authors,
            "fields": fields,
        }
        post_to_slack(webhook, attachments=[card])
def post_to_slack(webhook, text=None, attachments=None):
    """POST a JSON message to a webhook URL.

    Args:
        webhook: URL to POST to.
        text: Optional message text; omitted from the payload when ``None``.
        attachments: Optional list of attachment dicts; omitted from the
            payload when ``None``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises on network/HTTP failure.
    """
    payload = {}
    if text is not None:
        payload["text"] = text
    if attachments is not None:
        payload["attachments"] = attachments

    body = json.dumps(payload).encode("utf-8")
    # Declare the body as JSON; without this urllib labels POST data as
    # application/x-www-form-urlencoded.
    request = urllib.request.Request(
        webhook,
        data=body,
        headers={"Content-Type": "application/json"},
    )
    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(request) as response:
        response.read()
def lambda_handler(event, context):
    """Entry point: fetch recent arXiv papers and report them to Slack.

    Args:
        event: Mapping with the keys ``webhook``, ``interval``,
            ``categories`` and ``keywords`` (see inline examples below).
        context: Unused.

    Raises:
        KeyError: If a required key is missing from *event*.
    """
    webhook = event["webhook"]  # https://hooks.slack.com/services/xxx/yyy/zzz
    interval = event["interval"]  # number of days to look back
    categories = event["categories"]  # ["cs.AI", "cs.CL", "cs.CV", "cs.DB", "cs.DC", "cs.DS", "cs.IR", "cs.NE", "stat.ML"]
    keywords = event["keywords"]  # ["some", "keywords"] # keywords you want

    # Take the current time directly in JST (UTC+9) with an aware datetime,
    # so the result does not depend on the host's local timezone being UTC.
    jst = datetime.timezone(datetime.timedelta(hours=9))
    base_date = datetime.datetime.now(jst)
    prev_date = base_date - datetime.timedelta(days=interval)

    papers = fetch_papers(categories, keywords, base_date, prev_date)
    notify(webhook, papers, interval, keywords)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment