Last active
November 11, 2015 19:01
-
-
Save tomasbedrich/bebb0e47783525b70487 to your computer and use it in GitHub Desktop.
Web scraping in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""List crime-map areas, then report bribery statistics for a user-chosen area."""
import sys

from requests import get

CRIME_CODE = 821  # crime-type code for bribery in the mapakriminality.cz API
TIMEOUT = 10  # seconds; without it a stalled server would hang the script forever

# Fetch level-2 areas. raise_for_status() turns an HTTP error into a clear
# exception instead of a cryptic JSON decode failure further down.
response = get("http://mapakriminality.cz/api/areas", params={"level": 2}, timeout=TIMEOUT)
response.raise_for_status()
areas = response.json()["areas"]

# Print each area's code and name so the user can pick one.
for area in areas:
    print(area["Code"], area["Name"])
print("-" * 80)

# Load the desired area code from the user.
desired_area_code = input("Which area are you interested in (enter code): ")

# Query crimes for the chosen area; grouping by area ensures that there will
# be at most one item in the "crimes" list.
params = {
    "areacode": desired_area_code,
    "crimetypes": CRIME_CODE,
    "groupby": "area",
}
response = get("http://mapakriminality.cz/api/crimes", params=params, timeout=TIMEOUT)
response.raise_for_status()
crimes = response.json()["crimes"]

if len(crimes) == 1:  # exactly one group means the area code was valid
    crimes = crimes[0]
    print("{} crimes found in this area, {} of them solved".format(crimes["Found"], crimes["Solved"]))
else:  # no results means (most probably) that user entered an invalid area code
    print("Invalid area code entered")
    sys.exit(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Scrape speaker names and descriptions from the LinuxDays 2015 people page
and write them out as tab-separated CSV and as JSON."""
import csv
import json

from requests import get
from bs4 import BeautifulSoup

# Make an HTTP GET request for people and access the HTML via the response's
# .text attribute.
html = get("https://www.linuxdays.cz/2015/lide").text
soup = BeautifulSoup(html, "html.parser")

# name -> description for every person found on the page
people = {}
# Find all elements with class="karta" (each contains info about one person).
# Note the _ after class: it prevents `class` from clashing with the keyword.
for person in soup.find_all(class_="karta"):
    # The base element for finding details is `person`, not the whole soup.
    name = person.find("h3").text
    description = person.find("p").text
    people[name] = description

# csv.writer handles quoting, so a tab or newline inside a description can no
# longer corrupt a row (a hand-rolled tab-separated print would). newline=""
# is required by the csv module; the encoding is pinned so the output does not
# depend on the platform default.
with open("people.csv", "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.writer(csv_file, delimiter="\t")
    for name, description in people.items():
        writer.writerow([name, description])

# Write the same data as JSON; ensure_ascii=False keeps accented characters
# readable instead of \u-escaped.
with open("people.json", "w", encoding="utf-8") as json_file:
    json.dump(people, json_file, ensure_ascii=False, indent=2)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import click | |
from requests import get | |
from bs4 import BeautifulSoup | |
from datetime import date | |
@click.command()
@click.argument("frm")
@click.argument("to")
@click.option("--date", default="{0:%Y%m%d}".format(date.today()))
@click.option("--tarif", default="REGULAR")
@click.option("--credit/--no-credit", default=False)
def studentagency_lines(frm, to, tarif, date, credit):
    """Scrape StudentAgency connections from FRM to TO and print a timetable."""
    # Assemble the booking URL from the CLI arguments.
    url = "https://jizdenky.studentagency.cz/m/Booking/from/{frm}/to/{to}/tarif/{tarif}/departure/{date}/retdep/{date}/return/false/credit/{credit}".format(
        frm=frm.upper(), to=to.upper(), tarif=tarif.upper(), date=date, credit=str(credit).lower())
    # Fetch the page and hand it to the HTML parser.
    soup = BeautifulSoup(get(url).text, "html.parser")

    def parse_transfer(element):
        # A cell that is not a plain integer is treated as a direct connection.
        try:
            return int(element.text)
        except ValueError:
            return 0

    # Collect one dict per listed connection.
    connections = []
    for row in soup.find_all(class_="show-detail"):
        connections.append({
            "vehicle": row.find(class_="type-img").find("img")["title"],
            "departure": row.find(class_="departure").text,
            "arrival": row.find(class_="arrival").text,
            "transfer": parse_transfer(row.find(class_="transfer")),
            "free": int(row.find(class_="free").text),
            "price": int(row.find(class_="price").text)
        })

    # Print the collected connections in aligned columns.
    for connection in connections:
        print("{departure:>8} {arrival:>8} {free:>5} {price:>7} Kc".format(**connection))

# When run from the console, click parses argv and invokes the command.
if __name__ == "__main__":
    studentagency_lines()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
requests==2.8.0
beautifulsoup4==4.4.1
click==5.1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment