-
-
Save shapiromatron/e46748a618f81b4319b83ccd789e92db to your computer and use it in GitHub Desktop.
eparm data fetcher
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
To run, ensure you have python3.6 or higher in your path. Then follow these steps: | |
```python | |
# create a new virtual environment in the current path named "venv" | |
python -m venv venv | |
# activate newly created virtual environment | |
source ./venv/bin/activate # mac/linux | |
.\venv\Scripts\activate # windows | |
# install required packages | |
pip install -U pip | |
pip install pandas requests-html | |
# call script | |
python ./eparm.py | |
``` | |
""" | |
from collections import defaultdict
from pathlib import Path

import pandas as pd
import requests_html
webpage_files = ["TCEQ_site1_page1.htm", "TCEQ_site1_page2.htm", "TCEQ_site1_page3.htm"] | |
data = defaultdict(list) | |
for webpage_file in webpage_files: | |
webpage = requests_html.HTML(html=Path(webpage_file).read_text()) | |
data_elements = webpage.find(".rgMasterTable > tbody > tr") | |
for el in data_elements: | |
classes = set(el.attrs.get("class", [])) | |
if "rgRow" in classes or "rgAltRow" in classes: | |
main_table_texts = [td.text for td in el.find("td")] | |
main_table_texts.insert(0, el.attrs["id"]) | |
else: | |
for row in el.find(".rgDetailTable > tbody > tr"): | |
sub_table_texts = [td.text for td in row.find("td")] | |
sub_table_texts.insert(0, row.attrs["id"]) | |
data["filename"].append(webpage_file) | |
# main table | |
data["Sample ID"].append(main_table_texts[0]) | |
data["Site No"].append(main_table_texts[2]) | |
data["Sample No"].append(main_table_texts[3]) | |
data["Sample Date"].append(main_table_texts[4]) | |
data["Sample Time"].append(main_table_texts[5]) | |
data["Location"].append(main_table_texts[6]) | |
data["Property ID"].append(main_table_texts[7]) | |
data["Matrix"].append(main_table_texts[8]) | |
data["Latitude"].append(main_table_texts[9]) | |
data["Longitude"].append(main_table_texts[10]) | |
# secondary table | |
data["Sample Analyte ID"].append(sub_table_texts[0]) | |
data["Sample Analyte No"].append(sub_table_texts[1]) | |
data["Sample Analyte"].append(sub_table_texts[2]) | |
data["Sample Analyte Result"].append(sub_table_texts[3]) | |
data["Sample Analyte Result Unit"].append(sub_table_texts[4]) | |
data["Sample Analyte Result Qualifier"].append(sub_table_texts[5]) | |
data["Sample Analyte Analysis"].append(sub_table_texts[6]) | |
data["Sample Analyte Total (T)/Dissolved (D)"].append( | |
sub_table_texts[7] | |
) | |
# Assemble all collected columns into a single frame and save it as a CSV on
# the user's Desktop (no index column — the data already carries row ids).
output_path = Path("~/Desktop/data.csv").expanduser().absolute()
frame = pd.DataFrame(data=data)
print(f"Writing file to {output_path}")
frame.to_csv(output_path, index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment