Skip to content

Instantly share code, notes, and snippets.

@davidcesarino
Created October 8, 2019 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save davidcesarino/8c5e4939b48862ca77a693c53aa1b0a1 to your computer and use it in GitHub Desktop.
Save davidcesarino/8c5e4939b48862ca77a693c53aa1b0a1 to your computer and use it in GitHub Desktop.
Builds simple PubMed, Web of Science and CENTRAL search strategies from the supplied MeSH descriptors: each descriptor is combined (OR) with its entry terms, and the resulting groups are intersected (AND).
# coding=utf-8
# Copyright 2019 David Cesarino de Sousa, Fábio Antônio Serra de Lima Junior
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import requests
import justext
import sys
import re
# --- User-facing text -------------------------------------------------------
# Banner printed before the usage message.
label_header_app_name = (
    "REQUEST: REsearch QUEry STrategy\n"
    "Intersect MeSH descriptors for PubMed, Web of Science and CENTRAL.\n"
    "(c) 2019 David Cesarino (davidcesarino@gmail.com)\n"
    " Fábio Serra (fabioserrajr@outlook.com)"
)
# The service name is inserted between these two fragments.
label_cut_start_prefix = "-> Please copy your "
label_cut_start_suffix = " search strategy below:"
# Shown when the script is started without any descriptor argument.
label_must_provide_descriptor = (
    "Usage: request.py \"Descriptor 1\" [\"Descriptor 2\"] [\"Descriptor 3\"] [...]\n\n"
    "Error: you must provide at least one descriptor as argument."
)

# Do not change anything past this line unless you know what you're doing.

# --- Service names used in the output headings ------------------------------
label_pubmed = "PubMed"
label_wos = "Web of Science"
label_central = "CENTRAL"

# --- MeSH page scraping -----------------------------------------------------
# The descriptor is appended verbatim to this prefix to build the page URL.
mesh_descriptor_uri_prefix = "https://www.ncbi.nlm.nih.gov/mesh/?term="
# Paragraphs after one containing an "enter" marker are collected as entry
# terms; a paragraph containing an "exit" marker stops the scan.
mesh_content_delimiter_enter = ['Entry Terms:']
mesh_content_delimiter_exit = ['All MeSH Categories', 'Previous Indexing:']

# --- Query building ---------------------------------------------------------
# Matches a comma together with any spaces around it.
regex_comma_with_optional_spaces = re.compile(' *, *')
# Replacements for the comma: plain form, and a form that closes/reopens quotes.
near2_with_spaces_lower_case = ' near/2 '
near2_with_spaces_isolated = '" NEAR/2 "'
# Selectors for the kind of query to build.
query_type_pubmed = 1
query_type_wos = 2
def is_args_valid():
    """Return True when at least one argument follows the script name."""
    descriptor_count = len(sys.argv) - 1  # sys.argv[0] is the script itself
    return descriptor_count >= 1
def validate_arguments():
    """Abort with a usage message when no descriptor was supplied.

    Prints the application header, a blank line and the usage/error text,
    then terminates the process with exit status 1. Returns normally (with
    None) when the command line is valid.

    Raises:
        SystemExit: with code 1 when no descriptor argument is present.
    """
    if is_args_valid():
        return
    print_header()
    print("")
    print(label_must_provide_descriptor)
    # sys.exit is always available; the bare `exit` helper is injected by the
    # `site` module and is missing e.g. under `python -S` or in frozen builds.
    sys.exit(1)
def print_header():
    """Print the application banner stored in `label_header_app_name`."""
    print(label_header_app_name)
def print_query(service_name, query):
    """Print a "please copy" heading naming the service, then the query.

    Args:
        service_name: Name of the target service; it is placed between the
            heading prefix and suffix labels.
        query: The search strategy to print on the following line(s).
    """
    heading = "".join([label_cut_start_prefix, service_name, label_cut_start_suffix])
    print(heading)
    print(query)
def mesh_is_delimiter_enter_reached(match_str):
    """Return True if `match_str` contains any of the "enter" delimiters.

    The delimiters are the strings listed in `mesh_content_delimiter_enter`;
    a plain substring test is used.
    """
    return any(marker in match_str for marker in mesh_content_delimiter_enter)
def mesh_is_delimiter_exit_reached(match_str):
    """Return True if `match_str` contains any of the "exit" delimiters.

    The delimiters are the strings listed in `mesh_content_delimiter_exit`;
    a plain substring test is used.
    """
    return any(marker in match_str for marker in mesh_content_delimiter_exit)
def mesh_get_descriptor_entry_terms(descriptor):
    """Fetch the MeSH page for `descriptor` and return its entry terms.

    The page is split into paragraphs with jusText. Scanning stops at the
    first paragraph containing an "exit" delimiter; every paragraph located
    after one containing an "enter" delimiter (and before the exit) is
    collected.

    Args:
        descriptor: Text appended to `mesh_descriptor_uri_prefix` to build
            the request URL.

    Returns:
        A list with the text of each collected paragraph, in page order.
        Empty when no "enter" delimiter is found before an "exit" one.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            when the server answers with an HTTP error status.
    """
    # Without a timeout, requests may wait forever on a stalled connection.
    response = requests.get(mesh_descriptor_uri_prefix + descriptor, timeout=30)
    # Fail loudly rather than silently building a query without entry terms
    # from an error page.
    response.raise_for_status()
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))

    inside_entry_terms = False
    names = []
    for paragraph in paragraphs:
        text = paragraph.text
        # The exit check comes first, so it ends the scan even if the enter
        # delimiter has not been seen yet.
        if mesh_is_delimiter_exit_reached(text):
            break
        if mesh_is_delimiter_enter_reached(text):
            # The delimiter paragraph itself is not an entry term.
            inside_entry_terms = True
        elif inside_entry_terms:
            names.append(text)
    return names
def mesh_get_descriptors_with_entry_terms():
    """Look up the entry terms of every descriptor given on the command line.

    Returns:
        A list of `(descriptor, entry_terms)` tuples, one per command-line
        argument after the script name, in the order they were supplied.
    """
    # sys.argv[0] is the script call itself, so it is left out.
    return [
        (descriptor, mesh_get_descriptor_entry_terms(descriptor))
        for descriptor in sys.argv[1:]
    ]
def get_comma_free(entry, isolated_joins):
    """Rewrite `entry` so that commas become NEAR/2 proximity operators.

    Args:
        entry: A descriptor or entry term.
        isolated_joins: When true, every comma-separated part is quoted on
            its own and the parts are joined with `" NEAR/2 "`; an entry
            with commas is additionally wrapped in parentheses. When false,
            commas are simply replaced by ` near/2 ` and nothing is quoted.

    Returns:
        The rewritten term. Spaces around each comma are consumed by the
        replacement.
    """
    has_comma = "," in entry
    if not isolated_joins:
        if has_comma:
            return regex_comma_with_optional_spaces.sub(near2_with_spaces_lower_case, entry)
        return entry
    if has_comma:
        joined = regex_comma_with_optional_spaces.sub(near2_with_spaces_isolated, entry)
        return '("' + joined + '")'
    return '"' + entry + '"'
def get_li(line):
    """Return the line label for `line`: '#', the number, and a space."""
    return "".join(("#", str(line), " "))
def get_item(line):
    """Return a reference to `line`: '#' followed by the number."""
    return "".join(("#", str(line)))
def central_get_query(descriptors_with_entry_terms):
    """Build the numbered, multi-line CENTRAL search strategy.

    For every descriptor the output contains one "MeSH descriptor" line, one
    line per entry term (commas replaced by ` near/2 `), and one line that
    ORs all of those lines together. A final line ANDs the OR-lines of all
    descriptors.

    Args:
        descriptors_with_entry_terms: Iterable of sequences whose first item
            is the descriptor and whose second item is the iterable of its
            entry terms.

    Returns:
        The strategy as a single string, lines separated by a newline and
        without a trailing newline.
    """
    lines = []
    root_lines = []  # numbers of the per-descriptor "or" lines
    line = 0
    for entry in descriptors_with_entry_terms:
        descriptor, entry_terms = entry[0], entry[1]
        line += 1
        first_line = line
        lines.append(get_li(line) + "MeSH descriptor: [" + descriptor + "] explode all trees")
        for entry_term in entry_terms:
            line += 1
            lines.append(get_li(line) + get_comma_free(entry_term, False))
        line += 1
        root_lines.append(line)
        # `range` exists on both Python 2 and 3, whereas `xrange` is a
        # NameError on Python 3. The current line is excluded on purpose: it
        # references every line of this descriptor that precedes it.
        references = [get_item(number) for number in range(first_line, line)]
        lines.append(get_li(line) + " or ".join(references))
    line += 1
    lines.append(get_li(line) + " and ".join(get_item(root) for root in root_lines))
    return "\n".join(lines)
def pubmed_get_descriptor_query(descriptor, entry_terms):
    """Build the PubMed clause for one descriptor.

    Args:
        descriptor: The MeSH descriptor.
        entry_terms: Iterable with the descriptor's entry terms (may be
            empty).

    Returns:
        A parenthesized group in which the descriptor and each entry term
        are double-quoted and joined with ` OR `, e.g.
        `("Asthma" OR "Bronchial Asthma")`.
    """
    # list() lets callers pass any iterable, not only a list.
    all_terms = [descriptor] + list(entry_terms)
    # No str() around the quoted term: it is already a string, and on
    # Python 2 str() raises UnicodeEncodeError for non-ASCII unicode terms.
    return '(' + ' OR '.join('"' + term + '"' for term in all_terms) + ')'
def wos_get_descriptor_query(descriptor, entry_terms):
    """Build the Web of Science clause for one descriptor.

    Args:
        descriptor: The MeSH descriptor.
        entry_terms: Iterable with the descriptor's entry terms (may be
            empty).

    Returns:
        A `TS=(...)` group in which the descriptor and each entry term,
        after being rewritten by `get_comma_free` with isolated joins, are
        joined with ` OR `.
    """
    # list() lets callers pass any iterable, not only a list.
    all_terms = [descriptor] + list(entry_terms)
    # get_comma_free already returns a string; wrapping it in str() is
    # redundant and, on Python 2, fails for non-ASCII unicode terms.
    return "TS=(" + ' OR '.join(get_comma_free(term, True) for term in all_terms) + ')'
def get_descriptor_query(query_type, descriptor_with_entry_terms):
    """Build the clause for one descriptor in the requested query dialect.

    Args:
        query_type: `query_type_pubmed` or `query_type_wos`.
        descriptor_with_entry_terms: Sequence whose first item is the
            descriptor and whose second item holds its entry terms.

    Returns:
        The clause produced by the PubMed or Web of Science builder.

    Raises:
        ValueError: when `query_type` is not one of the known types.
            ValueError derives from Exception, so handlers written for the
            previously raised bare Exception keep working.
    """
    if query_type == query_type_pubmed:
        return pubmed_get_descriptor_query(descriptor_with_entry_terms[0], descriptor_with_entry_terms[1])
    if query_type == query_type_wos:
        return wos_get_descriptor_query(descriptor_with_entry_terms[0], descriptor_with_entry_terms[1])
    # query_type is normally an int; concatenating it to a str directly
    # raised TypeError and hid the intended message.
    raise ValueError("[BUG] Invalid query type=[" + str(query_type) + "]")
def get_joined_query(all_descriptors_queries):
    """Intersect the per-descriptor clauses.

    Each item is converted with `str` and the results are joined with
    ` AND `. An empty input gives an empty string.
    """
    clauses = map(str, all_descriptors_queries)
    return ' AND '.join(clauses)
def get_query(query_type, descriptors_with_entry_terms):
    """Build the complete single-line query for one dialect.

    Args:
        query_type: Dialect selector passed on to `get_descriptor_query`.
        descriptors_with_entry_terms: Iterable of (descriptor, entry terms)
            pairs.

    Returns:
        The clauses of all descriptors, combined by `get_joined_query`.
    """
    clauses = [
        get_descriptor_query(query_type, pair)
        for pair in descriptors_with_entry_terms
    ]
    return get_joined_query(clauses)
def main():
    """Validate the command line, build the three strategies and print them.

    All queries are built before anything is printed. The output order is
    PubMed, Web of Science, CENTRAL, with one blank line between sections.
    """
    validate_arguments()
    descriptors = mesh_get_descriptors_with_entry_terms()
    sections = [
        (label_pubmed, get_query(query_type_pubmed, descriptors)),
        (label_wos, get_query(query_type_wos, descriptors)),
        (label_central, central_get_query(descriptors)),
    ]
    for position, (service_name, query) in enumerate(sections):
        if position > 0:
            print("")  # blank separator between sections, none before the first
        print_query(service_name, query)
# Run only when executed as a script, so that importing this module does not
# trigger argument validation, network requests or a process exit.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment