Skip to content

Instantly share code, notes, and snippets.

@bcbwilla
Created February 25, 2015 16:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bcbwilla/c637c4304eb28cab76ad to your computer and use it in GitHub Desktop.
Save bcbwilla/c637c4304eb28cab76ad to your computer and use it in GitHub Desktop.
amcsd scraper
""" Extract the CIF files for each mineral in the
American Mineralogist Crystal Structure Database
http://rruff.geo.arizona.edu/AMS/
Puts files in 'amcsd_data' folder.
"""
import requests
import string
import urllib
import time
import os
from bs4 import BeautifulSoup
# Root URL of the AMCSD site; every request URL is built from it.
base_url = "http://rruff.geo.arizona.edu/AMS/"
# Directory (relative to the working directory) that receives the CIF files.
data_folder = 'amcsd_data'

# Create the output directory up front.  Attempting the creation and then
# inspecting the failure avoids the check-then-create race and works on both
# Python 2 and 3 (no ``exist_ok`` needed).
try:
    os.makedirs(data_folder)
except OSError:
    # An already existing directory is fine; anything else (e.g. a regular
    # file with that name, or a permission problem) is a real error.
    if not os.path.isdir(data_folder):
        raise
def get_mineral_names():
    """Return the names of all minerals listed in the database index.

    Requests the index page for each letter ``a``-``z`` and, for every
    ``<td width="20%">`` cell that contains a link, collects the link's
    single child.  Pages that do not answer with HTTP 200 are skipped.

    Returns:
        list: the collected names, in the order they appear on the pages.
    """
    mineral_url = base_url + "index_min.php?letter="
    mineral_names = []
    for letter in string.ascii_lowercase:
        r = requests.get(mineral_url + letter)
        if r.status_code != 200:
            continue
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(r.content, "html.parser")
        for cell in soup.find_all("td", {"width": "20%"}):
            anchor = cell.a
            if anchor is None:
                # Matching cell without a link: there is no name to read.
                continue
            name_list = anchor.contents
            # Only anchors with exactly one child are taken as a name.
            if len(name_list) == 1:
                mineral_names.append(name_list[0])
        # Short pause between index page requests.
        time.sleep(0.1)
    return mineral_names
def get_cif_files(mineral_name):
    """Download every CIF file listed for ``mineral_name``.

    Posts the name to ``result.php``, collects the CIF download links from
    the returned page and saves each one in ``data_folder`` as
    ``<mineral_name>_<NN>.cif``, where ``NN`` is the zero-padded position of
    the link among the CIF links found.

    Args:
        mineral_name: value sent as the ``Mineral`` form field; also used as
            the prefix of the saved file names.
    """
    post_url = base_url + "result.php"
    r = requests.post(post_url, data={'Mineral': mineral_name})
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.content, "html.parser")

    mineral_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if "/AMS/download.php?id=" in href and ".cif&down=cif" in href:
            # Keep what follows "/AMS/" so it can be appended to base_url.
            # Unlike a fixed slice, this also works when the href does not
            # start with that prefix.
            mineral_links.append(href.partition("/AMS/")[2])

    for i, mineral_link in enumerate(mineral_links):
        file_name = mineral_name + '_' + str(i).zfill(2) + '.cif'
        file_name = os.path.join(data_folder, file_name)
        # Fetch with requests (already used above) instead of the
        # Python-2-only urllib.urlretrieve.
        response = requests.get(base_url + mineral_link)
        if response.status_code == 200:
            with open(file_name, 'wb') as cif_file:
                cif_file.write(response.content)
        # Short pause between downloads.
        time.sleep(0.2)
# Script driver: collect every mineral name, then download the CIF files for
# each of them, reporting progress every 25 minerals.
print("Getting mineral names")
mineral_names = get_mineral_names()
print("Getting CIF files for each mineral")
n_minerals = len(mineral_names)
for i, mineral_name in enumerate(mineral_names):
    get_cif_files(mineral_name)
    if i % 25 == 0:
        # Function-call form of print: valid on Python 2 and 3 alike.
        print("%s/%s done." % (str(i).zfill(4), n_minerals))
print("Done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment