Created
October 8, 2019 17:59
-
-
Save davidcesarino/8c5e4939b48862ca77a693c53aa1b0a1 to your computer and use it in GitHub Desktop.
Creates simple PubMed, Web of Science and CENTRAL search strategies for each supplied MeSH descriptor, intersecting (AND) them.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
# Copyright 2019 David Cesarino de Sousa, Fábio Antônio Serra de Lima Junior | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import requests | |
import justext | |
import sys | |
import re | |
# User-facing text. Adjacent string literals inside parentheses are joined by
# the compiler, so each constant is still a single string.

# Banner printed before the usage message.
label_header_app_name = (
    "REQUEST: REsearch QUEry STrategy\n"
    "Intersect MeSH descriptors for PubMed, Web of Science and CENTRAL.\n"
    "(c) 2019 David Cesarino (davidcesarino@gmail.com)\n"
    " Fábio Serra (fabioserrajr@outlook.com)"
)

# The heading above each generated query is prefix + service name + suffix.
label_cut_start_prefix = "-> Please copy your "
label_cut_start_suffix = " search strategy below:"

# Shown when the script is started without any descriptor argument.
label_must_provide_descriptor = (
    "Usage: request.py \"Descriptor 1\" [\"Descriptor 2\"] [\"Descriptor 3\"] [...]\n\n"
    "Error: you must provide at least one descriptor as argument."
)
# Do not change anything past this line unless you know what you're doing.

# Service names inserted into the heading printed above each query.
label_pubmed = "PubMed"
label_wos = "Web of Science"
label_central = "CENTRAL"

# The descriptor is appended verbatim to this prefix to build the MeSH page URL.
mesh_descriptor_uri_prefix = "https://www.ncbi.nlm.nih.gov/mesh/?term="

# Substrings of the extracted page paragraphs that open / close the span from
# which entry terms are collected.
mesh_content_delimiter_enter = ["Entry Terms:"]
mesh_content_delimiter_exit = ["All MeSH Categories", "Previous Indexing:"]

# Matches a comma together with any spaces around it.
regex_comma_with_optional_spaces = re.compile(" *, *")

# Replacements for such a comma: a bare operator, or one that also closes and
# reopens the surrounding double quotes.
near2_with_spaces_lower_case = " near/2 "
near2_with_spaces_isolated = "\" NEAR/2 \""

# Selectors understood by get_descriptor_query().
query_type_pubmed = 1
query_type_wos = 2
def is_args_valid():
    """Return True when at least one argument follows the script name."""
    # sys.argv[0] is the script itself; anything after it is a descriptor.
    descriptor_args = sys.argv[1:]
    return bool(descriptor_args)
def validate_arguments():
    """Abort with a usage message when no descriptor was supplied.

    If ``is_args_valid()`` returns a false value, prints the application
    header, an empty line and the usage/error text, then terminates the
    process with exit status 1. Otherwise returns ``None`` without output.

    Raises:
        SystemExit: with code 1 when the arguments are not valid.
    """
    if is_args_valid():
        return
    print_header()
    print("")
    print(label_must_provide_descriptor)
    # Use sys.exit() rather than the bare exit() helper: exit() is added to
    # builtins by the site module for interactive sessions and is not
    # guaranteed to exist (e.g. when started with ``python -S``).
    sys.exit(1)
def print_header():
    """Write the application banner (``label_header_app_name``) to stdout."""
    banner = label_header_app_name
    print(banner)
def print_query(service_name, query):
    """Print a heading naming the target service, then the query itself.

    Args:
        service_name: text placed between ``label_cut_start_prefix`` and
            ``label_cut_start_suffix`` to form the heading line.
        query: the search strategy, printed as-is below the heading.
    """
    heading = "".join((label_cut_start_prefix, service_name, label_cut_start_suffix))
    print(heading)
    print(query)
def mesh_is_delimiter_enter_reached(match_str):
    """Return True if ``match_str`` contains any opening delimiter.

    The candidates are the substrings listed in
    ``mesh_content_delimiter_enter``.
    """
    return any(marker in match_str for marker in mesh_content_delimiter_enter)
def mesh_is_delimiter_exit_reached(match_str):
    """Return True if ``match_str`` contains any closing delimiter.

    The candidates are the substrings listed in
    ``mesh_content_delimiter_exit``.
    """
    return any(marker in match_str for marker in mesh_content_delimiter_exit)
def mesh_get_descriptor_entry_terms(descriptor, timeout=30):
    """Download the MeSH page for ``descriptor`` and return its entry terms.

    The page at ``mesh_descriptor_uri_prefix + descriptor`` is fetched and
    split into paragraphs by jusText (English stoplist). Paragraph texts are
    collected starting after the first paragraph that matches an opening
    delimiter and stopping at the first paragraph that matches a closing
    delimiter.

    Args:
        descriptor: MeSH descriptor, appended verbatim to the URL prefix.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script indefinitely.

    Returns:
        A list with the text of every collected paragraph, in page order.
        Empty when no opening delimiter is found before a closing one.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx response.
    """
    response = requests.get(mesh_descriptor_uri_prefix + descriptor, timeout=timeout)
    # Fail loudly on HTTP errors; otherwise an error page would be parsed and
    # the descriptor would silently end up with no entry terms.
    response.raise_for_status()
    paragraphs = justext.justext(response.content, justext.get_stoplist("English"))

    names = []
    inside_entry_terms = False
    for paragraph in paragraphs:
        text = paragraph.text
        # The closing delimiter is checked first, so it ends the scan even if
        # the opening delimiter was never seen.
        if mesh_is_delimiter_exit_reached(text):
            break
        if mesh_is_delimiter_enter_reached(text):
            # The delimiter paragraph itself is not an entry term.
            inside_entry_terms = True
        elif inside_entry_terms:
            names.append(text)
    return names
def mesh_get_descriptors_with_entry_terms():
    """Look up the entry terms of every descriptor given on the command line.

    Returns:
        A list of ``(descriptor, entry_terms)`` tuples, one per command-line
        argument after the script name, in the order they were supplied.
    """
    # sys.argv[0] is the script call itself and is not a descriptor.
    return [
        (descriptor, mesh_get_descriptor_entry_terms(descriptor))
        for descriptor in sys.argv[1:]
    ]
def get_comma_free(entry, isolated_joins):
    """Rewrite ``entry`` so that it no longer contains commas.

    Every comma (with any spaces around it) is replaced by a NEAR/2 operator.

    Args:
        entry: the term to rewrite.
        isolated_joins: when true, each comma-separated part is wrapped in
            double quotes and a multi-part result is also parenthesised;
            when false, the parts are joined with a bare ``near/2`` and
            nothing is quoted.

    Returns:
        The rewritten term.
    """
    has_comma = "," in entry
    if isolated_joins:
        if not has_comma:
            return '"' + entry + '"'
        quoted_parts = regex_comma_with_optional_spaces.sub(near2_with_spaces_isolated, entry)
        return '("' + quoted_parts + '")'
    if not has_comma:
        return entry
    return regex_comma_with_optional_spaces.sub(near2_with_spaces_lower_case, entry)
def get_li(line):
    """Return the prefix that starts numbered line ``line``: ``"#<line> "``."""
    return "#%s " % (line,)
def get_item(line):
    """Return the reference to numbered line ``line``: ``"#<line>"``."""
    return "#%s" % (line,)
def central_get_query(descriptors_with_entry_terms):
    """Build the numbered, multi-line CENTRAL search strategy.

    For every descriptor the output contains one "MeSH descriptor" line, one
    line per entry term (commas replaced via ``get_comma_free(term, False)``)
    and one line OR-ing all of those lines together. A final line ANDs the
    per-descriptor OR lines.

    Args:
        descriptors_with_entry_terms: iterable of ``(descriptor, entry_terms)``
            pairs; only indexes 0 and 1 of each item are read.

    Returns:
        The strategy as a single string, lines separated by ``"\\n"`` and no
        trailing newline.
    """
    line = 0
    output_lines = []
    root_lines = []  # line numbers of the per-descriptor "or" lines
    for entry in descriptors_with_entry_terms:
        line += 1
        first_line = line
        output_lines.append(
            get_li(line) + "MeSH descriptor: [" + entry[0] + "] explode all trees")
        for entry_term in entry[1]:
            line += 1
            output_lines.append(get_li(line) + get_comma_free(entry_term, False))
        line += 1
        root_lines.append(line)
        # range() instead of xrange(): xrange does not exist on Python 3 and
        # the span is tiny, so nothing is lost on Python 2 either.
        references = [get_item(number) for number in range(first_line, line)]
        output_lines.append(get_li(line) + " or ".join(references))
    line += 1
    output_lines.append(
        get_li(line) + " and ".join(get_item(root) for root in root_lines))
    return "\n".join(output_lines)
def pubmed_get_descriptor_query(descriptor, entry_terms):
    """Build the PubMed clause for one descriptor.

    Args:
        descriptor: the descriptor name.
        entry_terms: list of entry terms for the descriptor.

    Returns:
        The descriptor and all entry terms, each in double quotes, joined
        with `` OR `` and wrapped in parentheses.
    """
    quoted_terms = [str('"' + term + '"') for term in [descriptor] + entry_terms]
    return '(' + ' OR '.join(quoted_terms) + ')'
def wos_get_descriptor_query(descriptor, entry_terms):
    """Build the Web of Science clause for one descriptor.

    Args:
        descriptor: the descriptor name.
        entry_terms: list of entry terms for the descriptor.

    Returns:
        ``TS=(...)`` containing the descriptor and all entry terms, each
        passed through ``get_comma_free(term, True)`` and joined with `` OR ``.
    """
    clauses = [str(get_comma_free(term, True)) for term in [descriptor] + entry_terms]
    return "TS=(" + ' OR '.join(clauses) + ')'
def get_descriptor_query(query_type, descriptor_with_entry_terms):
    """Build the clause for one descriptor in the requested query dialect.

    Args:
        query_type: ``query_type_pubmed`` or ``query_type_wos``.
        descriptor_with_entry_terms: sequence whose item 0 is the descriptor
            and item 1 its list of entry terms.

    Returns:
        The clause produced by the matching ``*_get_descriptor_query`` helper.

    Raises:
        Exception: if ``query_type`` is neither of the known selectors.
    """
    if query_type == query_type_pubmed:
        return pubmed_get_descriptor_query(
            descriptor_with_entry_terms[0], descriptor_with_entry_terms[1])
    if query_type == query_type_wos:
        return wos_get_descriptor_query(
            descriptor_with_entry_terms[0], descriptor_with_entry_terms[1])
    # str() is required: the selectors are ints, and concatenating one to a
    # string would raise TypeError and hide the real error message.
    raise Exception("[BUG] Invalid query type=[" + str(query_type) + "]")
def get_joined_query(all_descriptors_queries):
    """Intersect the per-descriptor clauses by joining them with `` AND ``."""
    clauses = map(str, all_descriptors_queries)
    return ' AND '.join(clauses)
def get_query(query_type, descriptors_with_entry_terms):
    """Build the full single-line query for one dialect.

    Each ``(descriptor, entry_terms)`` item is turned into a clause with
    ``get_descriptor_query`` and the clauses are combined with
    ``get_joined_query``.
    """
    clauses = [
        get_descriptor_query(query_type, item)
        for item in descriptors_with_entry_terms
    ]
    return get_joined_query(clauses)
def main():
    """Validate the command line, fetch entry terms and print all queries.

    Exits with status 1 (via ``validate_arguments``) when no descriptor was
    given. Otherwise the PubMed, Web of Science and CENTRAL strategies are
    built first and then printed in that order, separated by empty lines.
    """
    validate_arguments()
    descriptors_with_entry_terms = mesh_get_descriptors_with_entry_terms()

    # Build everything before printing so a failure cannot leave partial output.
    pubmed_query = get_query(query_type_pubmed, descriptors_with_entry_terms)
    wos_query = get_query(query_type_wos, descriptors_with_entry_terms)
    central_query = central_get_query(descriptors_with_entry_terms)

    print_query(label_pubmed, pubmed_query)
    print("")
    print_query(label_wos, wos_query)
    print("")
    print_query(label_central, central_query)


# Run only when executed as a script, so importing this module (for reuse or
# testing) does not trigger network requests or terminate the interpreter.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment