Skip to content

Instantly share code, notes, and snippets.

@kilomeow
Created February 25, 2022 15:28
Show Gist options
  • Save kilomeow/bd3a02bfb0542dfa0ff47bbd7fa86d67 to your computer and use it in GitHub Desktop.
Save kilomeow/bd3a02bfb0542dfa0ff47bbd7fa86d67 to your computer and use it in GitHub Desktop.
import requests
# Path and query string appended to the site's base URL to request the listing.
target = '/modules.php?name=terr'

# Query-string fragment that selects a listing page; the page number is
# appended directly after it.
pgnum = '&pagenum='

# Value sent as the User-Agent header with every request.
userAgent = (
    "Mozilla/5.0 (X11; Linux i686; rv:5.0) "
    "Gecko/20101129 Firefox/36.0"
)
def cutNumber(s):
    """Return the first run of consecutive decimal digits in *s* as an int.

    Characters before the first digit are skipped; scanning stops at the
    first non-digit character that follows the run.

    Args:
        s: Text to scan.

    Returns:
        The integer value of the first digit run found in *s*.

    Raises:
        ValueError: If *s* contains no decimal digit.
    """
    digits = []
    for ch in s:
        # isdecimal() rather than isnumeric(): characters such as "²" or "½"
        # count as numeric, but int() cannot parse them.
        if ch.isdecimal():
            digits.append(ch)
        elif digits:
            # The digit run has ended; ignore everything after it.
            break
    if not digits:
        # Only a short prefix is echoed because *s* may be a whole page.
        raise ValueError("no decimal digits found in {!r}".format(s[:40]))
    return int("".join(digits))
from html.parser import HTMLParser
class StreetsParser(HTMLParser):
    """Collect the text of selected ``<td>`` cells from the HTML it is fed.

    Cells are numbered from zero in the order their start tags appear. The
    odd-numbered cells from index 3 onward (3, 5, 7, ...) are captured, and
    the text of each one becomes one entry of ``streets``.
    """

    def __init__(self):
        super().__init__()
        self.streets = []       # one string per captured cell
        self.td_i = -1          # index of the most recently opened <td>
        self.capture = False    # True while inside a captured cell

    def handle_starttag(self, tag, attrs):
        """Count every ``<td>`` and open a new entry for a captured one."""
        if tag != "td":
            return
        self.td_i += 1
        if self.td_i < 2 or self.td_i % 2 != 1:
            return
        self.capture = True
        self.streets.append("")

    def handle_endtag(self, tag):
        """Stop capturing when a ``<td>`` closes."""
        if tag != "td":
            return
        self.capture = False

    def handle_data(self, data):
        """Append each text chunk, newline-terminated, to the current entry."""
        if not self.capture:
            return
        self.streets[-1] = self.streets[-1] + data + "\n"
def extractStreets(source):
    """Extract the captured cell texts from the table after 'divTerrList'.

    Finds the first ``<table ...> ... </table>`` span that follows the first
    occurrence of the text ``divTerrList`` in *source* and runs it through
    ``StreetsParser``.

    Args:
        source: HTML text of one page.

    Returns:
        The list of strings collected by ``StreetsParser``. An empty list is
        returned when the ``divTerrList`` marker, the opening ``<table`` or
        the closing ``</table`` cannot be found.
    """
    closing_tag = '</table>'

    # str.find() returns -1 when nothing matches; check it explicitly rather
    # than letting -1 act as a slice index.
    anchor = source.find('divTerrList')
    if anchor == -1:
        return []
    table_start = source.find('<table', anchor)
    if table_start == -1:
        return []
    # Search from the table start so a closing tag that precedes the opening
    # one can never be picked up.
    table_end = source.find('</table', table_start)
    if table_end == -1:
        return []

    parser = StreetsParser()
    parser.feed(source[table_start:table_end + len(closing_tag)])
    return parser.streets
def scrapeStreets(url, timeout=30):
    """Download every listing page of a site and collect its street entries.

    Requests ``url + target`` first, reads the number that follows the last
    ``pgnum`` occurrence in that page as the last page number, then requests
    pages 2..last and passes every page through ``extractStreets``.

    Args:
        url: Base URL of the site; trailing slashes are ignored.
        timeout: Timeout passed to each HTTP request (seconds, as defined by
            ``requests``). Without one a stalled server blocks forever.

    Returns:
        A list with the entries of all pages, in page order.

    Raises:
        requests.RequestException: If a request fails or times out.
        ValueError: If a ``pgnum`` link is present but no number follows it.
    """
    base = url.rstrip('/') + target

    # One session so the header is set once and the connection can be reused.
    with requests.Session() as session:
        session.headers.update({'User-Agent': userAgent})

        first_page = session.get(base, timeout=timeout).text

        marker_at = first_page.rfind(pgnum)
        if marker_at == -1:
            # No pagination link on the page: the listing has a single page.
            last_page_num = 1
        else:
            last_page_num = cutNumber(first_page[marker_at:])

        pages = [first_page]
        for n in range(2, last_page_num + 1):
            page_url = base + pgnum + str(n)
            pages.append(session.get(page_url, timeout=timeout).text)

    all_streets = []
    for page in pages:
        all_streets.extend(extractStreets(page))
    return all_streets
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment