@marcosfelt
Created March 16, 2023 20:53
Wikipedia Chemical Data Scraper
import logging
import re
from typing import Tuple, Union

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

wikipedia_base = "https://en.wikipedia.org"

logger = logging.getLogger(__name__)


def get_wikipedia_dipole_moments() -> pd.DataFrame:
    """Get dipole moments from wikipedia"""
    # Site URL
    url = "https://en.wikipedia.org/wiki/Glossary_of_chemical_formulae"
    # Make a GET request to fetch the raw HTML content
    html_content = requests.get(url).text
    # Parse HTML code and get tables
    soup = BeautifulSoup(html_content, "lxml")
    tables = soup.find_all("table", attrs={"class": "wikitable sortable"})
    logger.info(f"Found {len(tables)} tables")
    # Get links from tables and scrape each linked compound page
    results = []
    try:
        for table in tqdm(tables):
            table_links = extract_table_links(table)
            for link in tqdm(table_links, position=1, leave=False):
                res = get_smiles_dipole_moment(link)
                results.append(res)
    except KeyboardInterrupt:
        logger.info("Keyboard interrupt. Stopping...")
    df = pd.DataFrame(results, columns=["smiles", "dipole_moment"])
    return df


def extract_table_links(table):
    """Extract compound page links from the synonyms column of a glossary table"""
    body = table.find_all("tr")
    body_rows = body[1:]  # skip the header row
    links = []
    for body_row in body_rows:
        synonym = body_row.find_all("td")[1]
        ahrefs = synonym.find_all("a")
        if len(ahrefs) == 0:
            continue
        rel_link = ahrefs[0].get("href")
        if rel_link:
            links.append(wikipedia_base + rel_link)
    return links


def get_smiles_dipole_moment(url: str) -> Tuple[Union[str, None], Union[str, None]]:
    """Get the SMILES and dipole moment from a wikipedia chembox if they're there"""
    # Get the HTML and parse it
    html_content = requests.get(url).text
    soup = BeautifulSoup(html_content, "lxml")
    # Find chembox table
    chemboxes = soup.find_all("table", attrs={"class": "infobox ib-chembox"})
    if len(chemboxes) == 0:
        return None, None
    chembox = chemboxes[0]
    body = chembox.find_all("tr")
    body_rows = body[1:]
    # Get table entries
    rows = []
    for body_row in body_rows:
        row = [el.text.strip() for el in body_row.find_all("td")]
        rows.append(row)
    # Convert to dataframe and extract the values
    df = pd.DataFrame(data=rows)
    df = df.dropna(subset=[0])
    smiles_row = df[df[0].str.contains("SMILES")]
    mu_row = df[df[0] == "Dipole moment"]
    # Clean
    smiles: Union[str, None] = None
    if smiles_row.shape[0] > 0:
        # Remove the leading "SMILES" label explicitly; str.lstrip("SMILES\n") treats
        # its argument as a character set and would also eat leading S/I atoms
        smiles = re.sub(r"^SMILES\s*", "", smiles_row.iloc[0, 0])  # type: ignore
    mu = None
    if mu_row.shape[0] > 0:
        matches = re.match(r"^[\d.]+", mu_row[1].iloc[0])
        if matches:
            mu = matches[0]
    return smiles, mu


if __name__ == "__main__":
    get_wikipedia_dipole_moments()
@marcosfelt (Author)

Install dependencies (lxml is needed for the parser passed to BeautifulSoup):

pip install pandas tqdm requests beautifulsoup4 lxml

Then run:

python wiki.py
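
The scraped DataFrame is only returned, not written anywhere, so here is a minimal sketch of calling the functions yourself and saving the results. This assumes the script is saved as wiki.py so the functions are importable; the Acetone URL and the CSV filename are just examples I picked.

from wiki import get_smiles_dipole_moment, get_wikipedia_dipole_moments

# Spot-check a single chembox page first (example URL, not part of the script)
print(get_smiles_dipole_moment("https://en.wikipedia.org/wiki/Acetone"))

# Full scrape: this requests every linked compound page, so it takes a while
df = get_wikipedia_dipole_moments()
df.dropna().to_csv("dipole_moments.csv", index=False)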
