Skip to content

Instantly share code, notes, and snippets.

@CloudCray
Last active August 29, 2015 14:03
Show Gist options
  • Save CloudCray/d93d3d72ff51157a87df to your computer and use it in GitHub Desktop.
Comp Manager Entry List Scraper - Single Event
# Python 3.4
import urllib.request as r
import urllib.parse as p
import gzip
import io
import bs4
import os
import csv
# CompMngr entry-list page to scrape (Michigan Dancesport Challenge 2015).
url = "http://www.compmngr.com/michcomp15/MichComp15_EntryLists.htm"
def get_page_text(url, referer=None, header=None, data=None):
    """Fetch *url* and return the response body as text.

    Sends the module-level HEADERS plus any accumulated COOKIES with the
    request, transparently decompresses gzip-encoded responses, and records
    any Set-Cookie response header into COOKIES for later requests.

    Parameters:
        url: the URL to fetch.
        referer: optional Referer header value.
        header: optional list of (name, value) extra header pairs.
        data: optional bytes payload; when given the request is a POST.

    Returns:
        The decoded response body as a str.
    """
    req = r.Request(url)
    for name in HEADERS:
        req.add_header(name, HEADERS[name])
    for name, value in COOKIES:
        req.add_header(name, value)
    # `header or []` instead of a mutable default argument ([] as a default
    # is shared across calls and accumulates mutations).
    for name, value in (header or []):
        req.add_header(name, value)
    if referer:
        req.add_header("Referer", referer)
    resp = r.urlopen(req, data) if data else r.urlopen(req)
    if resp.getheader("Set-Cookie"):
        # Remember the cookie so subsequent requests share the session.
        COOKIES.append(("Cookie", resp.getheader("Set-Cookie")))
    enc = resp.headers.get("Content-Encoding")
    output = resp.read()
    if enc and enc.upper() == "GZIP":
        # We advertise Accept-Encoding: gzip, so the server may compress.
        with gzip.GzipFile(fileobj=io.BytesIO(output), mode="rb") as gf:
            output = gf.read()
    if isinstance(output, bytes):
        # Decode with the charset the server declared, falling back to UTF-8.
        # (Decoding as ASCII, as before, replaced every non-ASCII character.)
        charset = resp.headers.get_content_charset() or "utf-8"
        output = output.decode(charset, "replace")
    return output
# Browser-like request headers sent with every request by get_page_text.
HEADERS = {
"Host": "www.compmngr.com",
"Connection": "keep-alive",
"Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
"Accept-Encoding": "gzip,deflate,sdch",
"Accept-Language": "en-US,en;q=0.8"}
# Session cookies; get_page_text appends ("Cookie", value) pairs from
# Set-Cookie response headers and replays them on later requests.
COOKIES = []
page_text = get_page_text(url)
# Name the parser explicitly: bare BeautifulSoup(text) warns and picks
# whichever parser happens to be installed, making output nondeterministic.
page_soup = bs4.BeautifulSoup(page_text, "html.parser")
divs = page_soup.findChildren("div")
# CompMngr wraps each event table in a div whose id contains "TABLE".
# .get() avoids a KeyError on divs that carry no id attribute at all.
div_tables = [x for x in divs if "TABLE" in x.get("id", "")]

# Collect the distinct event names from the table cells, skipping the
# "Event" column header.
events = []
for div in div_tables:
    table = div.find("table")
    if table:
        for item in table.findAll("td"):
            text = item.text
            if text != "Event" and text not in events:
                events.append(text)

# Map each event name to the (lead, follow) couples entered in it.
entries = {x: [] for x in events}
for event in events:
    entrants = [y for y in divs if event in y.text]
    for x in entrants:
        persons = x.findChildren("strong")
        if len(persons) < 2:
            # Not an "Entries for ... With ..." div; skip malformed markup
            # (the original indexed persons[0]/persons[1] unconditionally).
            continue
        person_1 = persons[0].text.replace("Entries for ", "")
        person_2 = persons[1].text.replace("With ", "")
        entries[event].append((person_1, person_2))

# Flatten to [event, lead, follow] rows and write a CSV beside the script.
output_as_list = []
for k in entries:
    for couple in entries[k]:
        output_as_list.append([k, couple[0], couple[1]])

filename = url.split("/")[-1].replace(".", "_") + ".csv"
# newline="" is what the csv module documents for output file objects;
# `with` guarantees the file is closed even if a write fails.
with open(filename, "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["Event", "Lead", "Follow"])
    writer.writerows(output_as_list)
# Python 3.4
import urllib.request as r
import urllib.parse as p
import gzip
import io
import bs4
import os
# NOTE(review): this line references `divs`, which is only defined further
# down (after the heat-list page is fetched and parsed) — running this file
# top-to-bottom raises NameError. It looks like a leftover from an
# interactive session and should execute after `divs` is built; confirm and
# reorder.
entrants = [x for x in divs if "AC- Adult (19+) Amateur International Ballroom Championships (W/T/VW/F/Q)" in x.text]
# Heat-list page for the Manhattan Dancesport 2014 competition.
url = r"http://www.compmngr.com/manhattan2014/Manhattan2014_HeatLists.htm"
def get_page_text(url, referer=None, header=None, data=None):
    """Fetch *url* and return the decoded response body as a str.

    The request carries the module-level HEADERS, any cookies accumulated in
    COOKIES, and any caller-supplied (name, value) pairs in *header*. A
    Set-Cookie response header is appended to COOKIES for later requests;
    gzip-encoded bodies are decompressed transparently.

    Parameters:
        url: the URL to fetch.
        referer: optional Referer header value.
        header: optional list of (name, value) extra header pairs.
        data: optional bytes payload; when given the request is a POST.
    """
    req = r.Request(url)
    for name in HEADERS:
        req.add_header(name, HEADERS[name])
    for name, value in COOKIES:
        req.add_header(name, value)
    # Default is None rather than [] — a mutable default list is shared
    # between calls and silently accumulates state.
    for name, value in (header or []):
        req.add_header(name, value)
    if referer:
        req.add_header("Referer", referer)
    resp = r.urlopen(req, data) if data else r.urlopen(req)
    if resp.getheader("Set-Cookie"):
        # Persist the session cookie for subsequent requests.
        COOKIES.append(("Cookie", resp.getheader("Set-Cookie")))
    enc = resp.headers.get("Content-Encoding")
    output = resp.read()
    if enc and enc.upper() == "GZIP":
        # Body arrived compressed (we send Accept-Encoding: gzip).
        with gzip.GzipFile(fileobj=io.BytesIO(output), mode="rb") as gf:
            output = gf.read()
    if isinstance(output, bytes):
        # Use the server-declared charset, defaulting to UTF-8; the old
        # ASCII decode replaced every non-ASCII character in names.
        charset = resp.headers.get_content_charset() or "utf-8"
        output = output.decode(charset, "replace")
    return output
# Browser-like request headers attached to every request by get_page_text.
HEADERS = {
"Host": "www.compmngr.com",
"Connection": "keep-alive",
"Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
"Accept-Encoding": "gzip,deflate,sdch",
"Accept-Language": "en-US,en;q=0.8"}
# Session cookies; get_page_text appends ("Cookie", value) pairs captured
# from Set-Cookie response headers.
COOKIES = []
page_text = get_page_text(url)
# Explicit parser: BeautifulSoup(text) alone warns and picks an arbitrary
# installed parser, so results could differ between machines.
page_soup = bs4.BeautifulSoup(page_text, "html.parser")
divs = page_soup.findChildren("div")

# Map lead -> follow for each couple entered in the selected event.
competitors = {}
for x in entrants:
    persons = x.findChildren("strong")
    if len(persons) < 2:
        # Div matched the event name but is not an "Entries for ... With ..."
        # block (e.g. the event table itself); skip instead of IndexError.
        continue
    person_1 = persons[0].text.replace("Entries for ", "")
    person_2 = persons[1].text.replace("With ", "")
    # NOTE(review): keying the duplicate check on person_2 appears intended
    # to suppress the same couple listed again with roles swapped — confirm.
    # If plain de-duplication was meant, the check should use person_1.
    if competitors.get(person_2) is None:
        competitors[person_1] = person_2

for lead, follow in competitors.items():
    print(lead + " and " + follow)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment