Skip to content

Instantly share code, notes, and snippets.

@jgensler8
Last active October 6, 2019 23:18
Show Gist options
  • Save jgensler8/3a1f2103e94ddc2f6b8292fae5fc2bcc to your computer and use it in GitHub Desktop.
Save jgensler8/3a1f2103e94ddc2f6b8292fae5fc2bcc to your computer and use it in GitHub Desktop.
import requests
from util import pagename
def download_page(pagenum):
    """Download one page of the NC products directory and save it to disk.

    The response body is written unmodified to the file named by
    ``pagename(pagenum)``.

    Args:
        pagenum: Value substituted for the ``page`` query parameter; also
            passed to ``pagename`` to pick the output file name.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
    """
    url = "https://www.ncagr.gov/NCproducts/directory.asp?page={pagenumber}&CatNum=1011&SubCatNum=11&SearchPhrase=&SearchType=Products&SortBy=&".format(pagenumber=pagenum)
    # Without a timeout, requests can wait indefinitely on a stalled server.
    page = requests.get(url, timeout=30)
    # Fail loudly rather than saving an error page as if it were listing data.
    page.raise_for_status()
    # ``page.content`` is raw bytes, so the file must be opened in binary mode.
    with open(pagename(pagenum), "wb") as f:
        f.write(page.content)
# Fetch listing pages 1 through 3.
for pagenum in range(1, 4):
    download_page(pagenum)
from lxml import html
from util import pagename, csvname
def parse_page(pagenum):
    """Convert a saved directory page into a CSV file.

    Reads the HTML file named by ``pagename(pagenum)``, visits every ``<p>``
    found at ``td/div/p`` beneath each ``<tr>``, and writes one CSV row per
    paragraph to the file named by ``csvname(pagenum)``. Each non-blank line
    of the paragraph's text becomes one column of the row.

    Args:
        pagenum: Page number passed to ``pagename`` and ``csvname`` to pick
            the input and output file names.
    """
    import csv  # function-scope import keeps this block self-contained

    # Read raw bytes so the HTML parser, not the platform's default text
    # encoding, decides how to decode the page.
    with open(pagename(pagenum), "rb") as f:
        page = html.fromstring(f.read())

    # Every table row in the document; the entries are looked up relative
    # to each row below.
    rows = page.xpath('//*/tr')
    print(rows)

    # Encode once at the file boundary; newline="" lets the csv writer
    # control line endings itself.
    with open(csvname(pagenum), "w", newline="", encoding="utf-8") as csvfile:
        # csv.writer quotes fields containing commas or quotes, which a
        # plain ",".join would silently corrupt.
        writer = csv.writer(csvfile, lineterminator="\n")
        for row in rows:
            for paragraph in row.xpath('td/div/p'):
                lines = paragraph.text_content().split('\n')
                # One field per non-blank line of the paragraph.
                fields = [line.strip() for line in lines if line.strip()]
                # Skip paragraphs that contain no text at all.
                if not fields:
                    continue
                # Replace non-breaking spaces with ordinary spaces.
                fields = [field.replace(u"\xa0", " ") for field in fields]
                writer.writerow(fields)
# Convert saved pages 1 through 3 to CSV.
for pagenum in range(1, 4):
    parse_page(pagenum)
def pagename(pagenum):
    """Return the HTML file name for page number *pagenum*."""
    template = "page_{}.html"
    return template.format(pagenum)
def csvname(pagenum):
    """Return the CSV file name for page number *pagenum*."""
    template = "page_{}.csv"
    return template.format(pagenum)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment