Skip to content

Instantly share code, notes, and snippets.

@Eastkap Eastkap/vroom_betatst.py
Last active Jan 16, 2017

Embed
What would you like to do?
Scraping vroomvroom.fr and turning the data into a database (base de données)
import csv
import re
import sys

from bs4 import BeautifulSoup
def fnum(s):
    """Return the index of the first digit character in *s*.

    Returns None when *s* contains no digit (including when it is empty).
    """
    for i, ch in enumerate(s):
        if ch.isdigit():
            return i
    return None
if sys.version_info[0] == 3:
from urllib.request import urlopen
else:
# Not Python 3 - today, it is most likely to be Python 2
# But note that this might need an update when Python 4
# might be around one day
from urllib import urlopen
# Your code where you can use urlopen
# Single-school scrape: writes a header plus one CSV row for the
# "paris-police" page.

# Compiled once; raw strings avoid the invalid "\d" escape of a plain literal.
_DIGIT_RE = re.compile(r"\d")
_TOTAL_RE = re.compile(r"sur \d")


def _itemprop_text(soup, prop):
    """Return the text of the first <span itemprop=prop>, or '' if absent."""
    tag = soup.find("span", {"itemprop": prop})
    return tag.text if tag is not None else ''


def _parse_success_rate(text):
    """Split the text of one success-rate row into (label, rate, total).

    Returns None when the text holds no usable digit, in which case the row
    contributes nothing to the CSV line.
    """
    first = _DIGIT_RE.search(text)
    if first is None:
        return None
    pos = first.start()
    # A digit followed by ')' (directly, or one character later) is skipped --
    # presumably it belongs to the label -- and the rate is the next digit
    # found from that ')' onward.
    for offset in (1, 2):
        # Slicing instead of indexing: no IndexError when the digit ends the text.
        if text[pos + offset:pos + offset + 1] == ')':
            following = _DIGIT_RE.search(text, pos + offset)
            if following is None:
                return None
            pos = following.start()
            break
    # The label skips the first character and the one just before the rate.
    label = text[1:pos - 1]
    rate = text[pos:pos + 2]
    total_match = _TOTAL_RE.search(text)
    if total_match is None:
        total = ''
    else:
        start = total_match.end() - 1  # index of the digit after "sur "
        total = text[start:start + 2]
    return label, rate, total


def _school_row(soup):
    """Build the CSV cells for one school page.

    The row is name, street address, postal code, city, followed by one
    (label, rate, total) triple per success-rate row.
    """
    # The slice drops a fixed 24-character suffix from the page title
    # (presumably the " à Paris - Vroomvroom.fr" part -- TODO confirm).
    title = soup.title.text[:-24] if soup.title is not None else ''
    row = [
        title,
        _itemprop_text(soup, "streetAddress"),
        _itemprop_text(soup, "postalCode"),
        _itemprop_text(soup, "addressLocality"),
    ]
    rate_rows = soup.find_all("div", {"class": "row row-success-rates"})
    for index, taux in enumerate(rate_rows):
        # The first row is always kept; a later row containing 'Code' stops
        # the scan.
        if index and 'Code' in taux.text:
            break
        print(taux.text[1:-22])
        parsed = _parse_success_rate(taux.text)
        if parsed is not None:
            row.extend(parsed)
    return row


with urlopen("https://www.vroomvroom.fr/auto-ecoles/paris/paris") as url:
    s = url.read()
# Listing page; parsed into `main` but not used any further in this block.
main = BeautifulSoup(s, 'html.parser')

# csv.writer handles quoting, so a comma or quote inside a name or address no
# longer shifts the columns; newline='' leaves line endings to the csv module.
with open('/Users/Jacobo/Documents/python/vroom/bvrooma.csv', 'w',
          encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['nom', 'adresse', 'cp', 'ville', 'taux'])
    with urlopen('https://www.vroomvroom.fr/auto-ecoles/paris/paris/paris-police') as url:
        s = url.read()
    soup = BeautifulSoup(s, 'html.parser')
    writer.writerow(_school_row(soup))
import csv
def dif(text, header):
    """Return the index in *header* of the first column belonging to *text*.

    A column belongs to *text* when its name is ``text + 'taux'`` or
    ``text + 'sur'``. Returns -1 when no such column exists.
    """
    wanted = (text + 'taux', text + 'sur')
    for i, name in enumerate(header):
        if name in wanted:
            return i
    return -1
def ajout(ligne, rows):
    """Append one scraped CSV line to *rows*, aligned with the header ``rows[0]``.

    *ligne* is a list of cells: four fixed cells followed by repeated
    ``label, taux, sur`` groups. For each group the header is searched for the
    column ``label + 'taux'``; when it is missing, the two columns
    ``label + 'taux'`` and ``label + 'sur'`` are appended to ``rows[0]``.
    The group's values are then stored under those columns, with ``' '``
    filling any columns the line has no value for.

    *rows* is modified in place (header possibly extended, one row appended);
    *ligne* is left untouched. Returns None.
    """
    header = rows[0]
    tab = list(ligne[:4])
    rates = ligne[4:]
    for start in range(0, len(rates), 3):
        label = rates[start]
        values = rates[start + 1:start + 3]
        if not values:
            # A trailing label with no figures (e.g. the lone 'taux' cell of
            # the scraper's own header line) carries no data: add no columns.
            continue
        try:
            # Only look past the four fixed columns.
            col = header.index(label + 'taux', 4)
        except ValueError:
            col = len(header)
            header.append(label + 'taux')
            header.append(label + 'sur')
        # Pad up to the end of this label's column pair, then drop the values
        # in place so they always sit under the matching header names.
        tab.extend([' '] * (col + 2 - len(tab)))
        tab[col:col + len(values)] = values
    rows.append(tab)
# header[4:]
# Normalise the scraped CSV so every rate label gets its own pair of columns.
rows = [['nom', 'adresse', 'cp', 'ville']]
# newline='' is what the csv module expects for files it reads or writes;
# without it the writer can emit doubled line endings on some platforms.
with open('/Users/Jacobo/Documents/python/vroom/vroomf.csv', 'rt',
          encoding='utf8', newline='') as f:
    lignes = csv.reader(f)
    for ligne in lignes:
        # csv.reader yields [] for blank lines; there is nothing to add then.
        if ligne:
            ajout(ligne, rows)
with open('/Users/Jacobo/Documents/python/vroom/trif.csv', 'wt',
          encoding='utf8', newline='') as f:
    writer = csv.writer(f)
    # rows[1] is built from the input file's own header line; drop it, but
    # only when the input actually produced a row.
    if len(rows) > 1:
        rows.pop(1)
    writer.writerows(rows)
import sys, re
from bs4 import BeautifulSoup
def fnum(s):
    """Return the index of the first digit character in *s*.

    Returns None when *s* contains no digit (including when it is empty).
    """
    for i, ch in enumerate(s):
        if ch.isdigit():
            return i
    return None
if sys.version_info[0] == 3:
from urllib.request import urlopen
else:
# Not Python 3 - today, it is most likely to be Python 2
# But note that this might need an update when Python 4
# might be around one day
from urllib import urlopen
# Your code where you can use urlopen
# Full scrape: one CSV row per school linked from the Paris listing page.

# Compiled once; raw strings avoid the invalid "\d" escape of a plain literal.
_DIGIT_RE = re.compile(r"\d")
_TOTAL_RE = re.compile(r"sur \d")


def _itemprop_text(soup, prop):
    """Return the text of the first <span itemprop=prop>, or '' if absent."""
    tag = soup.find("span", {"itemprop": prop})
    return tag.text if tag is not None else ''


def _parse_success_rate(text):
    """Split the text of one success-rate row into (label, rate, total).

    Returns None when the text holds no usable digit, in which case the row
    contributes nothing to the CSV line.
    """
    first = _DIGIT_RE.search(text)
    if first is None:
        return None
    pos = first.start()
    # A digit followed by ')' (directly, or one character later) is skipped --
    # presumably it belongs to the label -- and the rate is the next digit
    # found from that ')' onward.
    for offset in (1, 2):
        # Slicing instead of indexing: no IndexError when the digit ends the text.
        if text[pos + offset:pos + offset + 1] == ')':
            following = _DIGIT_RE.search(text, pos + offset)
            if following is None:
                return None
            pos = following.start()
            break
    # The label skips the first character and the one just before the rate.
    label = text[1:pos - 1]
    rate = text[pos:pos + 2]
    total_match = _TOTAL_RE.search(text)
    if total_match is None:
        total = ''
    else:
        start = total_match.end() - 1  # index of the digit after "sur "
        total = text[start:start + 2]
    return label, rate, total


def _school_row(soup):
    """Build the CSV cells for one school page.

    The row is name, street address, postal code, city, followed by one
    (label, rate, total) triple per success-rate row.
    """
    # The slice drops a fixed 24-character suffix from the page title
    # (presumably the " à Paris - Vroomvroom.fr" part -- TODO confirm).
    title = soup.title.text[:-24] if soup.title is not None else ''
    row = [
        title,
        _itemprop_text(soup, "streetAddress"),
        _itemprop_text(soup, "postalCode"),
        _itemprop_text(soup, "addressLocality"),
    ]
    rate_rows = soup.find_all("div", {"class": "row row-success-rates"})
    for index, taux in enumerate(rate_rows):
        # The first row is always kept; a later row containing 'Code' stops
        # the scan.
        if index and 'Code' in taux.text:
            break
        parsed = _parse_success_rate(taux.text)
        if parsed is not None:
            row.extend(parsed)
    return row


with urlopen("https://www.vroomvroom.fr/auto-ecoles/paris/paris") as url:
    s = url.read()
main = BeautifulSoup(s, 'html.parser')

# csv.writer handles quoting, so a comma or quote inside a name or address no
# longer shifts the columns; newline='' leaves line endings to the csv module.
with open('/Users/Jacobo/Documents/python/vroom/vroomf.csv', 'w',
          encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['nom', 'adresse', 'cp', 'ville', 'taux'])
    for auto in main.find_all("a", {"class": "school-listing-title"}):
        school_url = "https://www.vroomvroom.fr" + auto['href']
        with urlopen(school_url) as url:
            print(school_url)
            s = url.read()
        soup = BeautifulSoup(s, 'html.parser')
        writer.writerow(_school_row(soup))
# print(soup.prettify())
# soup.title.text : page title, e.g. 'CER LEGENDRE à Paris - Vroomvroom.fr'
# soup.find("span", {"itemprop": "streetAddress"}).text
# soup.find("span", {"itemprop": "postalCode"}).text
# soup.find("span", {"itemprop": "addressLocality"}).text
# soup.findAll("div", {"class": "row row-success-rates"})[i].text
# the idea here would be to check for a new occurrence of \nCode de la route\ in order to stop
# on: https://www.vroomvroom.fr/auto-ecoles/paris/paris
# soup.findAll("a", {"class": "school-listing-title"})[0]
# soup.findAll("a", {"class": "school-listing-title"})[0]['href']
# print(s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.