Last active
October 6, 2019 23:18
-
-
Save jgensler8/3a1f2103e94ddc2f6b8292fae5fc2bcc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from util import pagename | |
def download_page(pagenum):
    """Download one directory listing page and save it to disk.

    Requests the ncagr.gov product directory with ``pagenum`` substituted
    into the ``page`` query parameter and writes the raw response body to
    the file named by ``pagename(pagenum)``.

    Args:
        pagenum: Page number to request; also selects the output file name.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never saved as if it were a listing.
    """
    url = "https://www.ncagr.gov/NCproducts/directory.asp?page={pagenumber}&CatNum=1011&SubCatNum=11&SearchPhrase=&SearchType=Products&SortBy=&".format(pagenumber=pagenum)
    # A timeout keeps the script from hanging forever on a stalled server.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    # page.content is bytes, so the file must be opened in binary mode;
    # text mode ("w") raises TypeError on Python 3.
    with open(pagename(pagenum), "wb") as f:
        f.write(page.content)
# Fetch listing pages 1 through 3.
for pagenum in range(1, 4):
    download_page(pagenum)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
from util import pagename, csvname | |
def _paragraph_fields(text):
    """Split a paragraph's text into its cleaned, non-blank lines."""
    stripped = (line.strip() for line in text.split('\n'))
    # u"\xa0" is a non-breaking space; normalise it to a plain space.
    return [line.replace(u"\xa0", " ") for line in stripped if line]


def parse_page(pagenum):
    """Convert one saved directory page into a CSV file.

    Reads the HTML file named by ``pagename(pagenum)``, takes the text of
    every ``<p>`` matched by ``td/div/p`` under each ``<tr>``, and writes one
    CSV row per paragraph to ``csvname(pagenum)``. Every non-blank line of a
    paragraph becomes one column; paragraphs with no text are skipped.

    Args:
        pagenum: Page number selecting both the input and the output file.
    """
    # Function-scope import keeps this block self-contained.
    import csv

    # Read raw bytes and hand them to lxml, rather than decoding with the
    # platform's default text encoding.
    with open(pagename(pagenum), "rb") as html_file:
        page = html.fromstring(html_file.read())

    # newline="" is what the csv module expects of its output file; the file
    # object performs the UTF-8 encoding once, at the I/O boundary.
    with open(csvname(pagenum), "w", encoding="utf-8", newline="") as csvfile:
        # csv.writer quotes fields that contain commas or quotes, which the
        # previous plain ",".join() silently corrupted.
        writer = csv.writer(csvfile, lineterminator="\n")
        for row in page.xpath('//*/tr'):
            for paragraph in row.xpath('td/div/p'):
                fields = _paragraph_fields(paragraph.text_content())
                if fields:
                    writer.writerow(fields)
# Convert saved pages 1 through 3 to CSV.
for pagenum in range(1, 4):
    parse_page(pagenum)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def pagename(pagenum):
    """Return the local HTML file name used for page ``pagenum``."""
    template = "page_{}.html"
    return template.format(pagenum)
def csvname(pagenum):
    """Return the local CSV file name used for page ``pagenum``."""
    template = "page_{}.csv"
    return template.format(pagenum)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment