@Nanguage
Created August 25, 2021 08:17
Crawl Nature's open-access article HTML and extract peer-review file links.
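The script is a python-fire CLI: each top-level function becomes a sub-command. A minimal usage sketch, assuming the gist is saved as crawl_nature.py (the file name, page range, and directory names here are illustrative, not part of the gist):

python crawl_nature.py fetch_article_url_list --page_range 1-5
python crawl_nature.py crawl_articles urls.txt --store_dir ./articles
python crawl_nature.py extract_peer_review_links ./articles

fetch_article_url_list prints URLs as it collects them, so stdout can be redirected to build a URL list file for the crawl step (note that fire also echoes the function's return value); extract_peer_review_links prints tab-separated page_id/link pairs.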

import typing as t
from pathlib import Path

import fire
import mechanicalsoup
from bs4 import BeautifulSoup
from tqdm import tqdm

browser = mechanicalsoup.StatefulBrowser()

base_urls = {
    "Nature": "https://www.nature.com"
}

page_temps = {
    "NC": "{base_url}/ncomms/research-articles?searchType=journalSearch&sort=PubDate&page={page_id}"
}


def get_article_urls(page, base=base_urls['Nature'], item_class="c-card__link u-link-inherit"):
    """Extract absolute article URLs from a journal listing page."""
    urls_relative = [a.attrs['href'] for a in page.find_all("a", class_=item_class)]
    urls = [base + u for u in urls_relative]
    return urls


def fetch_article_url_list(page_range="1-100", base=base_urls['Nature'], temp=page_temps['NC']):
    """Collect article URLs from a range of journal listing pages.

    :param page_range: Inclusive page range, e.g. "1-100".
    """
    start, end = [int(i) for i in page_range.split('-')]
    prange = range(start, end + 1)
    urls = []
    for page_id in prange:
        page_url = temp.format(base_url=base, page_id=page_id)
        browser.open(page_url)
        page = browser.page
        urls_page = get_article_urls(page, base)
        urls += urls_page
        for u in urls_page:
            print(u)
    return urls


def crawl_articles(urls: t.Union[str, t.List[str]], store_dir="./articles"):
    """Crawl article pages.

    :param urls: Path to a file that stores article URLs, or a list of article URLs.
    :param store_dir: Path to the directory where results are stored.
    """
    store_path = Path(store_dir)
    if not store_path.exists():
        store_path.mkdir()
    if isinstance(urls, str):
        with open(urls) as f:
            urls = [line.strip() for line in f]
    for url in tqdm(urls):
        browser.open(url)
        page = browser.page
        # str.strip removes characters, not a suffix, so strip(".html") would
        # mangle IDs; take the last path segment and drop the extension instead.
        page_id = url.rsplit('/', 1)[-1]
        if page_id.endswith(".html"):
            page_id = page_id[:-len(".html")]
        out_f = store_path / (page_id + ".html")
        with open(out_f, 'w') as f:
            f.write(str(page))


def extract_peer_review_links(html_dir: str):
    """Print a "page_id<TAB>link" line for each saved page that has a Peer Review File."""
    dir_html = Path(html_dir)
    htmls = dir_html.glob("*.html")
    for path in htmls:
        with open(path) as f:
            content = f.read()
        soup = BeautifulSoup(content, "html.parser")
        pv_links = soup.find_all("a", string="Peer Review File")
        if len(pv_links) == 0:
            continue
        pv_link = pv_links[0].attrs['href']
        # Path.stem yields the file name without the ".html" suffix;
        # str.strip(".html") would strip characters, not the suffix.
        page_id = path.stem
        print(page_id, pv_link, sep="\t")


if __name__ == "__main__":
    fire.Fire()
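
# The same pipeline can also be driven from Python directly (a sketch; the
# page range and directory below are illustrative, not part of the original gist):
#
#     urls = fetch_article_url_list(page_range="1-2")
#     crawl_articles(urls, store_dir="./articles")
#     extract_peer_review_links("./articles")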