Created
February 25, 2015 16:01
-
-
Save bcbwilla/c637c4304eb28cab76ad to your computer and use it in GitHub Desktop.
amcsd scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" Extract the CIF files for each mineral in the
American Mineralogist Crystal Structure Database
http://rruff.geo.arizona.edu/AMS/
Puts files in 'amcsd_data' folder.
"""
import requests | |
import string | |
import urllib | |
import time | |
import os | |
from bs4 import BeautifulSoup | |
# Root of the AMCSD web interface; every request URL is built by appending
# a page name to it.
base_url = "http://rruff.geo.arizona.edu/AMS/"

# Directory (relative to the current working directory) that receives the
# downloaded CIF files.
data_folder = 'amcsd_data'

# Make sure the output directory is present before any download starts.
if os.path.exists(data_folder):
    pass  # something with that name already exists; leave it alone
else:
    os.makedirs(data_folder)
def get_mineral_names():
    """Return the names of all minerals listed in the database index.

    Requests the index page for every letter ``a``-``z`` and inspects each
    ``<td width="20%">`` cell on it.  When the cell's link has exactly one
    child, that child is taken as the mineral name.  Letters whose page does
    not come back with HTTP status 200 are skipped.

    Returns:
        list: The collected names, in the order they appear on the pages.
    """
    mineral_url = base_url + "index_min.php?letter="
    mineral_names = []
    for letter in string.ascii_lowercase:
        r = requests.get(mineral_url + letter)
        # Throttle after every request -- including failed ones -- so the
        # server is never hit back-to-back.
        time.sleep(0.1)
        if r.status_code != 200:
            continue
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(r.content, "html.parser")
        for cell in soup.find_all("td", {"width": "20%"}):
            link = cell.a
            if link is None:
                # Cell without a link (e.g. layout padding): nothing to read.
                continue
            name_list = link.contents
            if len(name_list) == 1:
                mineral_names.append(name_list[0])
    return mineral_names
def get_cif_files(mineral_name):
    """Download every CIF file listed for ``mineral_name``.

    Posts the name to ``result.php``, collects the CIF download links found
    on the returned page and saves each one in ``data_folder`` as
    ``<mineral_name>_NN.cif``, where ``NN`` is the zero-padded position of
    the link on the page.  Nothing is downloaded when the search request
    does not return HTTP status 200.

    Args:
        mineral_name: Name of the mineral to search for; also used as the
            file-name prefix.
    """
    # urlretrieve lives in ``urllib`` on Python 2 and in ``urllib.request``
    # on Python 3; resolve it here so the function works on both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    post_url = base_url + "result.php"
    r = requests.post(post_url, data={'Mineral': mineral_name})
    if r.status_code != 200:
        # Same policy as get_mineral_names: skip pages that failed to load.
        return
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.content, "html.parser")

    mineral_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if "/AMS/download.php?id=" in href and ".cif&down=cif" in href:
            # Keep only the part after "/AMS/" so it can be appended to
            # base_url.  Splitting (instead of dropping a fixed 5 characters)
            # also handles hrefs that carry a scheme/host prefix.
            mineral_links.append(href.split("/AMS/", 1)[1])

    for i, mineral_link in enumerate(mineral_links):
        file_name = mineral_name + '_' + str(i).zfill(2) + '.cif'
        file_name = os.path.join(data_folder, file_name)
        urlretrieve(base_url + mineral_link, file_name)
        # Be polite to the server between downloads.
        time.sleep(0.2)
# Script driver: fetch the full list of mineral names, then download the CIF
# files of each mineral, printing a progress line every 25 minerals.
print("Getting mineral names")
mineral_names = get_mineral_names()

print("Getting CIF files for each mineral")
n_minerals = len(mineral_names)
# Count from 1 so the progress line states how many minerals are finished.
for done, mineral_name in enumerate(mineral_names, 1):
    get_cif_files(mineral_name)
    if done % 25 == 0:
        # Parenthesised form is valid on both Python 2 and Python 3.
        print("%s/%s done." % (str(done).zfill(4), n_minerals))
print("Done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment