Skip to content

Instantly share code, notes, and snippets.

@benjaminkreen
Last active July 19, 2019 21:28
Show Gist options
  • Save benjaminkreen/4a6bc6c30637dd07fd66876a63e784ff to your computer and use it in GitHub Desktop.
Save benjaminkreen/4a6bc6c30637dd07fd66876a63e784ff to your computer and use it in GitHub Desktop.
Gets alert results based on NED alerts
require 'httparty'
require 'date'
# This currently just gets all of the alerts and fetches the resulting documents. A good way to test this would be to
# compare the returned count of documents. If it's close, that would be great.
# here are all the keys that exist in the query column of the alerts table.
# ["startPage", "filterSubjects", "unformattedQuery", "query", "eLocationId", "pageSize", "resultView", "volume", "sortValue", "sortKey", "filterKeyword", "filterArticleTypes", "filterSubjectsDisjunction", "filterAuthors", "filterJournals", "id", "sort", "filterArticleType", "filterStartDate", "filterEndDate", "resultsPerPage", "sortOrder", "page", "x", "y", "from", "q", "filterSections"]
# Keys copied from a saved alert's query into the outgoing search; every other
# key found in the alerts table is dropped.
ALLOWED_QUERY_KEYS = %w[
  q filterSubjects volume filterKeyword filterArticleTypes filterSubjectsDisjunction
  filterAuthors filterJournals filterArticleType filterSections query unformattedQuery
].freeze
# Search endpoint used for alerts that are not PLoSONE alerts.
WOMBAT_SEARCH_URL = 'https://collections.plos.org/dynamicSearch'.freeze
# Search endpoint used for PLoSONE alerts.
SOLR_API_URL = 'https://api.plos.org/search'.freeze
# A result count above this marks the email data as "over max".
MAX_ARTICLES = 50
# Base URL of the search links placed in the email data.
EMAIL_URL = 'https://journals.plos.org/plosone/search'.freeze
# SendGrid dynamic template used for PLoSONE alert emails.
PONE_TEMPLATE_ID = 'd-85c86529aaf24a82b891745f2c76b0e9'.freeze
# Read the key from the environment so it never has to be committed; the
# placeholder remains as the fallback so the script still loads without it.
SENDGRID_API_KEY = ENV.fetch('SENDGRID_API_KEY', 'your-key').freeze
SENDGRID_API_URL = 'https://api.sendgrid.com/v3/mail/send'.freeze
# Why try to deconstruct the wombat abstraction to make it solr compliant again? Just use wombat!
# First get alerts, weekly or monthly. I've cached them to a file
# TODO: implement NED API
# Builds a single OR'd subject query from a list of subject names. Earlier
# terms are nested in parentheses: (subject:"A") OR subject:"B".
# Accepts nil, a single value or an array; returns '' when there are no subjects.
def build_subject_query(subjects)
  Array(subjects).inject('') do |q, subj|
    q.empty? ? "subject:\"#{subj}\"" : "(#{q}) OR subject:\"#{subj}\""
  end
end

# Picks the docs whose subject list contains +subj+ (substring match) and maps
# them to the { title, authors, doi } hashes placed in the email data.
# Docs without a 'subject' or 'author' field are tolerated.
def articles_for_subject(docs, subj)
  matching = Array(docs).select do |doc|
    Array(doc['subject']).any? { |doc_subj| doc_subj.include?(subj) }
  end
  matching.map do |doc|
    { title: doc['title'], authors: Array(doc['author']).join(', '), doi: doc['id'] }
  end
end

# Builds the search link for one subject.
# The journal filter is only applied when the subject had results, so an empty
# subject links out to other journals; the date filter is only applied when the
# search went over the maximum.
def subject_search_url(subj, query, has_results:, over_max:)
  url_params = { unformattedQuery: "subject:\"#{subj}\"" }
  url_params.merge!(query.slice('filterJournals')) if has_results
  url_params.merge!(query.slice('filterStartDate', 'filterEndDate')) if over_max
  EMAIL_URL + '?' + URI.encode_www_form(url_params)
end

# Request headers do not depend on the alert, so build them once.
json_request_headers = { 'Accept' => 'application/json' }
sendgrid_headers = {
  'Authorization' => "Bearer #{SENDGRID_API_KEY}",
  'Content-Type' => 'application/json'
}

alerts = JSON.parse(File.read('weekly_saved_searches.json'))
alerts.select { |x| x['name'] == 'PLoSONE' }.sample(2).each do |alert|
  # here we clean out a bunch of escape cruft and only keep the queries we care about
  # not sure why the other stuff is in there
  # puts alert
  original_query = JSON.parse(alert["query"].squeeze("\\"))
  # Drop nil/blank values; values that have no notion of "empty" (e.g. numbers) are kept.
  query = original_query.slice(*ALLOWED_QUERY_KEYS).reject do |_key, value|
    value.nil? || (value.respond_to?(:empty?) && value.empty?)
  end

  # set date range; capture "today" once so every use agrees even across midnight
  end_date = Date.today
  day_offset = alert['frequency'] == 'monthly' ? 30 : 7
  start_date = end_date - day_offset
  query['filterStartDate'] = start_date.to_s
  query['filterEndDate'] = end_date.to_s

  # query manipulation: there are a few fields that qualify as queries
  possible_queries = []
  # Here's the only non-wombat friendly thing: Disjunction is a fancy word for 'OR'. Since these come from
  # the akita ui, there's no wombat query per se, so we just build our own OR'd query.
  disjunction = query.delete('filterSubjectsDisjunction')
  possible_queries.push(build_subject_query(disjunction)) if disjunction
  # there seems to have been a mess with what the query actually is. I've decided this is the precedence order,
  # falling back on *:*
  %w[q query unformattedQuery].each { |key| possible_queries.push(query.delete(key)) }
  possible_queries.push('*:*')
  query['q'] = possible_queries.compact.reject(&:empty?).first

  template_data = {
    start_date: start_date.strftime('%b %d %Y'),
    end_date: end_date.strftime('%b %d %Y')
  }
  sendgrid_params = {
    from: { email: 'news@lists.plos.org' },
    personalizations: [
      {
        to: [
          { email: 'bmiller@plos.org' } # TODO: fetch correct email
        ],
        dynamic_template_data: template_data
      }
    ]
  }
  puts alert

  if alert['name'] == 'PLoSONE'
    # For PONE searches
    email_subjects = Array(original_query['filterSubjectsDisjunction'])
    if email_subjects.empty?
      # Nothing to report per subject; skip rather than send an empty email.
      warn "Skipping #{alert['name']} alert: no filterSubjectsDisjunction in its query"
      next
    end

    solr_params = {
      wt: 'json',
      fl: 'title,subject,id,author',
      # Without an explicit rows value only Solr's default page size comes back.
      rows: MAX_ARTICLES,
      # The user query is parenthesised so an OR inside it cannot mix with the surrounding ANDs.
      q: "publication_date:[#{start_date.strftime('%FT%TZ')} TO #{end_date.strftime('%FT%TZ')}] " \
         "AND (#{query['q']}) AND doc_type:\"full\""
    }
    resp = HTTParty.get(SOLR_API_URL + '?' + URI.encode_www_form(solr_params), headers: json_request_headers)
    sleep 1 # Requesting too fast
    parsed = resp.parsed_response
    search_results = parsed['response'] if parsed.is_a?(Hash)
    unless search_results
      warn "Skipping #{alert['name']} alert: unexpected search response"
      next
    end

    over_max = search_results['numFound'].to_i > MAX_ARTICLES
    sendgrid_params[:template_id] = PONE_TEMPLATE_ID
    template_data[:over_max] = over_max
    template_data[:subjects] = email_subjects.map do |subj|
      # assemble doc data for subject
      subj_docs = articles_for_subject(search_results['docs'], subj)
      # assemble url data if necessary
      url = nil
      if subj_docs.empty? || over_max
        url = subject_search_url(subj, query, has_results: !subj_docs.empty?, over_max: over_max)
      end
      # TODO: support subject tier grouping
      { name: subj, articles: subj_docs, url: url }
    end

    # send email
    sendgrid_resp = HTTParty.post(SENDGRID_API_URL, { body: sendgrid_params.to_json, headers: sendgrid_headers })
    puts sendgrid_resp.parsed_response
  else
    resp = HTTParty.get(WOMBAT_SEARCH_URL + '?' + URI.encode_www_form(query), headers: json_request_headers)
    search_results = resp.parsed_response['searchResults']
  end
  # puts search_results
  # puts "Found " + search_results['numFound'].to_i.to_s + " results for #{alert['name']}"
  # search_results['docs'].take(3).each { |doc| puts doc['title'] } # print out top 3
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment