Skip to content

Instantly share code, notes, and snippets.

@RobertMatkulcik
Created January 17, 2016 15:03
Show Gist options
  • Save RobertMatkulcik/3d073be7de64738c3933 to your computer and use it in GitHub Desktop.
Save RobertMatkulcik/3d073be7de64738c3933 to your computer and use it in GitHub Desktop.
data scraping
# -*- coding: utf-8 -*-
import re
from bs4 import BeautifulSoup
from requests import Session
import csv
from selenium.webdriver import Firefox
from selenium.common.exceptions import NoSuchElementException
def titulky_content(session, url):
    """Fetch *url* through *session* and return its subtitle table rows.

    The response body is parsed with BeautifulSoup's ``html.parser`` and
    every ``<tr>`` element carrying the CSS class ``r1`` or ``r`` is
    collected.

    Args:
        session: Object whose ``get(url)`` returns a response exposing
            ``.content`` (``main`` passes a ``requests.Session``).
        url: Address of the page to download.

    Returns:
        The matched rows: all ``r1`` rows first, followed by all ``r`` rows.
    """
    soup = BeautifulSoup(session.get(url).content, "html.parser")
    # find_all is the supported spelling; findAll is only a legacy alias.
    rows = soup.find_all("tr", class_="r1")
    rows += soup.find_all("tr", class_="r")
    return rows
def extract_film(film):
    """Return ``[name, href]`` taken from the first ``a`` element in *film*.

    Args:
        film: Element exposing ``find``; ``csv_writer`` passes the items
            it receives one at a time.

    Returns:
        A two-item list: the link's text and the value of its ``href``
        attribute.
    """
    link = film.find("a")
    return [link.text, link.get("href")]
# save to csv
def csv_writer(filename, el):
    """Write the name and URL of each film row in *el* to a CSV file.

    The file is created (or truncated) and starts with the header row
    ``Názov,Url``; one data row follows for each item of *el*, built by
    ``extract_film``.

    Args:
        filename: Path of the CSV file to write.
        el: Iterable of film rows accepted by ``extract_film``.
    """
    # newline="" lets the csv module control line endings itself; without
    # it, platforms that translate "\n" emit "\r\r\n" and every record is
    # followed by a blank line.
    with open(filename, "w", newline="") as csv_file:
        file_writer = csv.writer(csv_file)
        file_writer.writerow(["Názov", "Url"])
        # A distinct loop name keeps the ``el`` parameter from being rebound.
        for film in el:
            file_writer.writerow(extract_film(film))
# read from csv
def csv_reader(filename):
    """Print every line of the file at *filename*.

    Lines are printed exactly as read, i.e. still carrying their own line
    terminator, so ``print`` adds a second newline after each one.

    Args:
        filename: Path of the file to read.
    """
    with open(filename, "r") as source:
        for line in source:
            print(line)
# collect the names (first CSV column) into a list
def to_array(filename):
    """Return the first-column values of a CSV file, skipping its header.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with the first field of every non-empty record after the
        first one; an empty list when the file has no data records.
    """
    # newline="" is what the csv module expects so that it can handle line
    # endings (including newlines embedded in quoted fields) itself.
    with open(filename, "r", newline="") as f:
        reader = csv.reader(f)
        # The default keeps an empty file from raising StopIteration.
        next(reader, None)  # ignore the header row
        # Blank lines come back from csv.reader as empty lists, which would
        # make row[0] raise IndexError, so they are skipped.
        return [row[0] for row in reader if row]
def get_driver():
    """Create and return a new Firefox WebDriver instance."""
    return Firefox()
def drive(name_array, csfd_url, int_i):
    """Look up one film on CSFD in a fresh browser and return its rating text.

    Opens *csfd_url*, types ``name_array[int_i]`` into the element with class
    ``text``, clicks the element with class ``submit``, tries to open the
    first film in the search results, and finally reads the element with
    class ``average``.

    Args:
        name_array: Sequence of film names.
        csfd_url: Address of the page holding the search form.
        int_i: Index into *name_array* of the film to search for.

    Returns:
        Text of the ``average`` element with every ``!`` and ``%`` removed.

    Raises:
        NoSuchElementException: If the search field, the submit button or
            the ``average`` element cannot be found.
    """
    # Imported locally so this function carries its own dependency. The
    # find_element_by_* helpers were removed in Selenium 4.3, whereas
    # find_element(By..., value) works on both old and new releases.
    from selenium.webdriver.common.by import By

    driver = get_driver()
    try:
        driver.get(csfd_url)
        driver.find_element(By.CLASS_NAME, "text").send_keys(name_array[int_i])
        driver.find_element(By.CLASS_NAME, "submit").click()
        try:
            driver.find_element(
                By.XPATH,
                '//*[@id="search-films"]/div[1]/ul[1]/li[1]/div/h3/a',
            ).click()
        except NoSuchElementException:
            # No first search hit: report it and still try to read a rating
            # from whatever page is currently loaded.
            print("NoSuchElementException")
        csfd_average = driver.find_element(By.CLASS_NAME, "average").text
        return re.sub('[!%]', '', csfd_average)
    finally:
        # Always close the browser; without this, any exception raised
        # above would leave the Firefox process running.
        driver.quit()
# def twod_list(columns, rows):
# a = [[x for x in range(columns)] for y in range(rows)]
# return a
def main(username, password):
    """Export the newest titulky.com titles to CSV and print CSFD ratings.

    Steps:
      1. Post the credentials to titulky.com within a ``requests`` session.
      2. Download the "newest subtitles" listing and write each row's name
         and URL to ``output.csv``.
      3. Read the names back and, one by one, look each up on CSFD with
         ``drive``, printing the returned rating text.

    The lookup loop stops at the first film whose lookup fails.

    Args:
        username: Value sent as the ``Login`` form field.
        password: Value sent as the ``Password`` form field.
    """
    # titulky.com
    login_url = "http://www.titulky.com/"
    najnovsie_titulky_url = "http://www.titulky.com/?orderby=3&OrderDate=2"
    # csfd.cz
    csfd_url = "http://www.csfd.cz/"
    # output CSV file
    filename = "output.csv"

    # --- titulky.com: log in and export the listing ---
    post_data = {
        "Login": username,
        "Password": password,
    }
    # The context manager closes the session's connections once the page
    # has been fetched and parsed.
    with Session() as session:
        session.post(login_url, post_data)
        # Fetch the listing of newest subtitles using the logged-in session.
        el = titulky_content(session, najnovsie_titulky_url)
    csv_writer(filename, el)

    # --- CSFD: look up the rating of every exported title ---
    name_array = to_array(filename)
    for int_i in range(len(name_array)):
        try:
            csfd_average = drive(name_array, csfd_url, int_i)
        except Exception:
            # Exception (not a bare except) so that Ctrl-C and SystemExit
            # still terminate the program.
            print("chyba nenasiel sa ziaden element")
            break
        print(csfd_average)
if __name__ == '__main__':
    # getpass reads the password without echoing it to the terminal,
    # unlike a plain input() call.
    from getpass import getpass

    username = "E.T.Bong"
    password = getpass()
    main(username, password)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment