Skip to content

Instantly share code, notes, and snippets.

@tomasbedrich
Last active November 11, 2015 19:01
Show Gist options
  • Save tomasbedrich/bebb0e47783525b70487 to your computer and use it in GitHub Desktop.
Web scraping in Python
#!/usr/bin/env python3
import sys
from requests import get

CRIME_CODE = 821  # = bribery

# Fetch level-2 areas; the JSON payload keeps them in a list under "areas".
response = get("http://mapakriminality.cz/api/areas", params={"level": 2})
areas = response.json()["areas"]

# List every area code with its name so the user can pick one.
for area in areas:
    print(area["Code"], area["Name"])
print("-" * 80)

# Ask the user which area they want crime statistics for.
desired_area_code = input("Which area are you interested in (enter code): ")

# Query the crimes endpoint for the chosen area and crime type.
params = {
    "areacode": desired_area_code,
    "crimetypes": CRIME_CODE,
    "groupby": "area",  # ensures that there will be only one item in the list
}
crimes = get("http://mapakriminality.cz/api/crimes", params=params).json()["crimes"]

if len(crimes) == 1:  # if there are some results
    crimes = crimes[0]
    print("{} crimes found in this area, {} of them solved".format(crimes["Found"], crimes["Solved"]))
else:  # no results means (most probably) that user entered invalid area code
    print("Invalid area code entered")
    sys.exit(1)
#!/usr/bin/env python3
import json
from requests import get
from bs4 import BeautifulSoup

# Download the speakers page; .text holds the decoded HTML body.
page = get("https://www.linuxdays.cz/2015/lide").text
parsed = BeautifulSoup(page, "html.parser")

# Collect name -> description for every person on the page.
people = {}
# Each element with class="karta" is one person's card; the trailing
# underscore in class_ avoids clashing with the `class` keyword.
for card in parsed.find_all(class_="karta"):
    # Search for the details inside the current card only.
    people[card.find("h3").text] = card.find("p").text

# Write the results as TAB-separated values...
with open("people.csv", "w") as csv_file:
    for name, description in people.items():
        print(name, description, sep="\t", file=csv_file)

# ...and also as human-readable JSON.
with open("people.json", "w") as json_file:
    json.dump(people, json_file, ensure_ascii=False, indent=2)
#!/usr/bin/env python3
import click
from requests import get
from bs4 import BeautifulSoup
from datetime import date
@click.command()
@click.argument("frm")
@click.argument("to")
@click.option("--date", default="{0:%Y%m%d}".format(date.today()))
@click.option("--tarif", default="REGULAR")
@click.option("--credit/--no-credit", default=False)
def studentagency_lines(frm, to, tarif, date, credit):
    """Print StudentAgency connections between two stops for one day.

    FRM and TO are location codes; --date is YYYYMMDD (defaults to today),
    --tarif is the tariff name, --credit toggles credit-fare prices.
    Click passes all arguments by keyword, so the parameter order here
    need not match the decorator order.
    """
    # assemble URL
    url = "https://jizdenky.studentagency.cz/m/Booking/from/{frm}/to/{to}/tarif/{tarif}/departure/{date}/retdep/{date}/return/false/credit/{credit}".format(
        frm=frm.upper(), to=to.upper(), tarif=tarif.upper(), date=date, credit=str(credit).lower())
    # create request and HTML parser
    response = get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    # prepare list for results
    lines = []
    # parse all lines
    for line in soup.find_all(class_="show-detail"):
        try:
            # try to interpret transfer as int; a missing element makes
            # find() return None so .text raises AttributeError, and a
            # non-numeric label raises ValueError — both mean "no transfer"
            transfer = int(line.find(class_="transfer").text)
        except (ValueError, AttributeError):
            transfer = 0
        # add a line to result list
        lines.append({
            "vehicle": line.find(class_="type-img").find("img")["title"],
            "departure": line.find(class_="departure").text,
            "arrival": line.find(class_="arrival").text,
            "transfer": transfer,
            "free": int(line.find(class_="free").text),
            "price": int(line.find(class_="price").text)
        })
    # print result
    for line in lines:
        print("{departure:>8} {arrival:>8} {free:>5} {price:>7} Kc".format(**line))


# if running from console
if __name__ == "__main__":
    studentagency_lines()  # click handles the rest
requests==2.8.0
beautifulsoup4==4.4.1
click==5.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment