Skip to content

Instantly share code, notes, and snippets.

@shapiromatron
Last active September 3, 2019 11:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shapiromatron/e46748a618f81b4319b83ccd789e92db to your computer and use it in GitHub Desktop.
Save shapiromatron/e46748a618f81b4319b83ccd789e92db to your computer and use it in GitHub Desktop.
eparm data fetcher
"""
To run, ensure you have python3.6 or higher in your path. Then follow these steps:
```python
# create a new virtual environment in the current path named "venv"
python -m venv venv
# activate newly created virtual environment
source ./venv/bin/activate # mac/linux
.\venv\Scripts\activate # windows
# install required packages
pip install -U pip
pip install pandas requests-html
# call script
python ./eparm.py
```
"""
from collections import defaultdict
from pathlib import Path
import pandas as pd
import requests_html
webpage_files = ["TCEQ_site1_page1.htm", "TCEQ_site1_page2.htm", "TCEQ_site1_page3.htm"]
data = defaultdict(list)
for webpage_file in webpage_files:
webpage = requests_html.HTML(html=Path(webpage_file).read_text())
data_elements = webpage.find(".rgMasterTable > tbody > tr")
for el in data_elements:
classes = set(el.attrs.get("class", []))
if "rgRow" in classes or "rgAltRow" in classes:
main_table_texts = [td.text for td in el.find("td")]
main_table_texts.insert(0, el.attrs["id"])
else:
for row in el.find(".rgDetailTable > tbody > tr"):
sub_table_texts = [td.text for td in row.find("td")]
sub_table_texts.insert(0, row.attrs["id"])
data["filename"].append(webpage_file)
# main table
data["Sample ID"].append(main_table_texts[0])
data["Site No"].append(main_table_texts[2])
data["Sample No"].append(main_table_texts[3])
data["Sample Date"].append(main_table_texts[4])
data["Sample Time"].append(main_table_texts[5])
data["Location"].append(main_table_texts[6])
data["Property ID"].append(main_table_texts[7])
data["Matrix"].append(main_table_texts[8])
data["Latitude"].append(main_table_texts[9])
data["Longitude"].append(main_table_texts[10])
# secondary table
data["Sample Analyte ID"].append(sub_table_texts[0])
data["Sample Analyte No"].append(sub_table_texts[1])
data["Sample Analyte"].append(sub_table_texts[2])
data["Sample Analyte Result"].append(sub_table_texts[3])
data["Sample Analyte Result Unit"].append(sub_table_texts[4])
data["Sample Analyte Result Qualifier"].append(sub_table_texts[5])
data["Sample Analyte Analysis"].append(sub_table_texts[6])
data["Sample Analyte Total (T)/Dissolved (D)"].append(
sub_table_texts[7]
)
df = pd.DataFrame(data=data)
write_filename = Path("~/Desktop/data.csv").expanduser().absolute()
print(f"Writing file to {write_filename}")
df.to_csv(write_filename, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment