Last active
August 29, 2015 14:03
-
-
Save CloudCray/d93d3d72ff51157a87df to your computer and use it in GitHub Desktop.
Comp Manager Entry List Scraper - Single Event
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python 3.4 | |
import urllib.request as r | |
import urllib.parse as p | |
import gzip | |
import io | |
import bs4 | |
import os | |
import csv | |
# Entry-lists page (CompMngr export) that this script scrapes.
url = "http://www.compmngr.com/michcomp15/MichComp15_EntryLists.htm"
def get_page_text(url, referer=None, header=None, data=None):
    """Fetch *url* and return the response body as text.

    Sends the module-level HEADERS plus any accumulated COOKIES, an
    optional ``Referer``, and any extra ``(name, value)`` pairs in
    *header*.  When *data* is given the request is sent as a POST.
    Gzip-encoded responses are decompressed.  A ``Set-Cookie`` response
    header is remembered in the module-level COOKIES list so later calls
    replay it.

    Bytes are decoded as ASCII with replacement, matching the original
    behavior (non-ASCII characters become U+FFFD).
    """
    if header is None:
        # Avoid the mutable-default-argument pitfall of ``header=[]``.
        header = []
    req = r.Request(url)
    for name, value in HEADERS.items():
        req.add_header(name, value)
    for name, value in COOKIES:
        req.add_header(name, value)
    for name, value in header:
        req.add_header(name, value)
    if referer:
        req.add_header("Referer", referer)
    resp = r.urlopen(req, data) if data else r.urlopen(req)
    if resp.getheader("Set-Cookie"):
        COOKIES.append(("Cookie", resp.getheader("Set-Cookie")))
    enc = resp.headers.get("Content-Encoding")
    if enc and enc.upper() == "GZIP":
        # gzip.decompress avoids leaving a GzipFile/BytesIO pair unclosed.
        output = gzip.decompress(resp.read())
    else:
        output = resp.read()
    if isinstance(output, bytes):
        output = output.decode("ascii", "replace")
    return output
# Browser-impersonating headers sent with every request (the site is
# queried as if from desktop Chrome).
HEADERS = {
    "Host": "www.compmngr.com",
    "Connection": "keep-alive",
    "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "en-US,en;q=0.8",
}
# (name, value) cookie header pairs accumulated from Set-Cookie responses.
COOKIES = []
# Download and parse the entry-list page.  Each event's table lives in a
# <div> whose id contains "TABLE".
page_text = get_page_text(url)
page_soup = bs4.BeautifulSoup(page_text)
divs = page_soup.find_all("div")
div_tables = [d for d in divs if "TABLE" in d["id"]]
# Collect the distinct event names from every table cell, skipping the
# "Event" column header and preserving first-seen order.
events = []
for container in div_tables:
    tbl = container.find("table")
    if not tbl:
        continue
    for cell in tbl.findAll("td"):
        label = cell.text
        if label != "Event" and label not in events:
            events.append(label)
# Map each event name to its list of (lead, follow) couples.  A div that
# mentions the event text is one couple's entry block; its first <strong>
# holds the lead ("Entries for X"), the second the partner ("With Y").
entries = {evt: [] for evt in events}
for evt in events:
    matching_divs = [d for d in divs if evt in d.text]
    for entry_div in matching_divs:
        names = entry_div.findChildren("strong")
        lead = names[0].text.replace("Entries for ", "")
        follow = names[1].text.replace("With ", "")
        entries[evt].append((lead, follow))
# Flatten the per-event couples into CSV-ready rows: [event, lead, follow].
output_as_list = [
    [event, lead, follow]
    for event, couples in entries.items()
    for lead, follow in couples
]
# Derive the output name from the page name, e.g.
# "MichComp15_EntryLists.htm" -> "MichComp15_EntryLists_htm.csv".
filename = url.split("/")[-1].replace(".", "_") + ".csv"
# newline="" is what the csv module requires so it can control row
# terminators itself; the with-block closes the file even on error.
with open(filename, "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(["Event", "Lead", "Follow"])
    writer.writerows(output_as_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python 3.4 | |
import urllib.request as r | |
import urllib.parse as p | |
import gzip | |
import io | |
import bs4 | |
import os | |
# --- Heat-list scraper: print every couple entered in one chosen event. ---

url = r"http://www.compmngr.com/manhattan2014/Manhattan2014_HeatLists.htm"

# Event whose entrants we want.  NOTE(review): in the original script the
# ``entrants`` comprehension ran up here, before ``divs`` was defined,
# raising NameError at import time; the lookup now happens after the page
# is fetched and parsed.
EVENT_NAME = "AC- Adult (19+) Amateur International Ballroom Championships (W/T/VW/F/Q)"


def get_page_text(url, referer=None, header=None, data=None):
    """Fetch *url* and return the response body as text.

    Sends the module-level HEADERS plus any accumulated COOKIES, an
    optional ``Referer``, and any extra ``(name, value)`` pairs in
    *header*.  When *data* is given the request is sent as a POST.
    Gzip-encoded responses are decompressed.  A ``Set-Cookie`` response
    header is remembered in COOKIES so later calls replay it.
    """
    if header is None:
        # Avoid the mutable-default-argument pitfall of ``header=[]``.
        header = []
    req = r.Request(url)
    for name, value in HEADERS.items():
        req.add_header(name, value)
    for name, value in COOKIES:
        req.add_header(name, value)
    for name, value in header:
        req.add_header(name, value)
    if referer:
        req.add_header("Referer", referer)
    resp = r.urlopen(req, data) if data else r.urlopen(req)
    if resp.getheader("Set-Cookie"):
        COOKIES.append(("Cookie", resp.getheader("Set-Cookie")))
    enc = resp.headers.get("Content-Encoding")
    if enc and enc.upper() == "GZIP":
        # gzip.decompress avoids leaving a GzipFile/BytesIO pair unclosed.
        output = gzip.decompress(resp.read())
    else:
        output = resp.read()
    if isinstance(output, bytes):
        output = output.decode("ascii", "replace")
    return output


# Browser-impersonating headers sent with every request.
HEADERS = {
    "Host": "www.compmngr.com",
    "Connection": "keep-alive",
    "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36",
    "Accept-Encoding": "gzip,deflate,sdch",
    "Accept-Language": "en-US,en;q=0.8",
}
# (name, value) cookie header pairs accumulated from Set-Cookie responses.
COOKIES = []

page_text = get_page_text(url)
page_soup = bs4.BeautifulSoup(page_text)
divs = page_soup.findChildren("div")

# Divs whose text mentions the event are the per-couple entry blocks.
entrants = [d for d in divs if EVENT_NAME in d.text]

# Map each lead to their partner for this event.  The first <strong> in an
# entry block reads "Entries for X", the second "With Y".
competitors = {}
for entry_div in entrants:
    names = entry_div.findChildren("strong")
    person_1 = names[0].text.replace("Entries for ", "")
    person_2 = names[1].text.replace("With ", "")
    # NOTE(review): this guard checks person_2 against the dict's *keys*,
    # which hold person_1 values -- possibly meant
    # ``competitors.get(person_1)``.  Behavior kept as-is pending
    # confirmation against the page data.
    if competitors.get(person_2) is None:
        competitors[person_1] = person_2

for lead in competitors:
    print(lead + " and " + competitors[lead])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment