Crawl Nature's open-access article HTML and peer-review file links.
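A sketch of how these functions might be driven through the fire CLI, assuming the gist is saved as crawl_nature.py (a hypothetical filename, not part of the original gist):

"""Crawl Nature's open-access article HTML and extract peer-review file links.

Typical run (hypothetical filename crawl_nature.py; fire also echoes a
function's return value to stdout, so redirect output with care):

    # Print article URLs for listing pages 1-5; save them to urls.txt.
    python crawl_nature.py fetch_article_url_list --page_range=1-5

    # Download each article page's HTML into ./articles.
    python crawl_nature.py crawl_articles urls.txt --store_dir=./articles

    # Print tab-separated (article id, peer review link) pairs.
    python crawl_nature.py extract_peer_review_links ./articles
"""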
import typing as t
from pathlib import Path

import fire
import mechanicalsoup
from bs4 import BeautifulSoup
from tqdm import tqdm
browser = mechanicalsoup.StatefulBrowser()

base_urls = {
    "Nature": "https://www.nature.com",
}
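# Listing-page URL templates; "NC" is Nature Communications (the /ncomms/
# path). Each template takes {base_url} and {page_id}.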
page_temps = {
    "NC": "{base_url}/ncomms/research-articles?searchType=journalSearch&sort=PubDate&page={page_id}",
}
def get_article_urls(page, base=base_urls['Nature'], item_class="c-card__link u-link-inherit"):
    """Extract absolute article URLs from a parsed listing page."""
    # The default item_class matches Nature's article-card markup and may
    # change as the site is updated.
    urls_relative = [a.attrs['href'] for a in page.find_all("a", class_=item_class)]
    urls = [base + u for u in urls_relative]
    return urls
def fetch_article_url_list(page_range="1-100", base=base_urls['Nature'], temp=page_temps['NC']):
    """Collect article URLs from a range of listing pages, printing each as it is found.

    :param page_range: Inclusive page range, e.g. "1-100".
    """
    start, end = [int(i) for i in page_range.split('-')]
    urls = []
    for page_id in range(start, end + 1):
        page_url = temp.format(base_url=base, page_id=page_id)
        browser.open(page_url)
        urls_page = get_article_urls(browser.page, base)
        urls += urls_page
        for u in urls_page:
            print(u)
    return urls
def crawl_articles(urls: t.Union[str, t.List[str]], store_dir="./articles"):
    """Crawl article pages and save their HTML.

    :param urls: Path to a file storing article URLs (one per line), or a list of article URLs.
    :param store_dir: Path to the directory where results are stored.
    """
    store_path = Path(store_dir)
    store_path.mkdir(parents=True, exist_ok=True)
    if isinstance(urls, str):
        with open(urls) as f:
            urls = [line.strip() for line in f]
    for url in tqdm(urls):
        browser.open(url)
        page = browser.page
        # str.strip(".html") removes characters from both ends, not the
        # suffix; take the last path component and drop ".html" if present.
        page_id = url.rsplit('/', 1)[-1]
        if page_id.endswith(".html"):
            page_id = page_id[:-len(".html")]
        out_f = store_path / (page_id + ".html")
        with open(out_f, 'w') as f:
            f.write(str(page))
def extract_peer_review_links(html_dir: str):
    """Scan saved article pages and print tab-separated (article id, peer review link) pairs."""
    dir_html = Path(html_dir)
    for path in dir_html.glob("*.html"):
        with open(path) as f:
            content = f.read()
        soup = BeautifulSoup(content, "html.parser")
        pv_links = soup.find_all("a", string="Peer Review File")
        if len(pv_links) == 0:
            continue
        pv_link = pv_links[0].attrs['href']
        # Path.stem reliably drops the ".html" suffix (str.strip would not).
        page_id = path.stem
        print(page_id, pv_link, sep="\t")
if __name__ == "__main__":
    fire.Fire()
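Calling fire.Fire() with no arguments exposes every module-level function as a subcommand, so the single file doubles as a small CLI for each stage of the pipeline: fetching listing pages, downloading article HTML, and extracting peer-review links.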