Skip to content

Instantly share code, notes, and snippets.

@Eastkap
Last active January 16, 2017 08:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Eastkap/5ca63128bb045a1e624342a96fff9974 to your computer and use it in GitHub Desktop.
Save Eastkap/5ca63128bb045a1e624342a96fff9974 to your computer and use it in GitHub Desktop.
Scraping vroomvroom.fr and turning the data into a database (base de données)
import sys, re
from bs4 import BeautifulSoup
def fnum(s):
    """Return the index of the first digit character in *s*.

    Args:
        s: The string to scan.

    Returns:
        The zero-based index of the first character for which
        ``str.isdigit()`` is true, or ``None`` when *s* contains no digit
        (this includes the empty string).
    """
    for index, char in enumerate(s):
        if char.isdigit():
            return index
    return None
if sys.version_info[0] == 3:
from urllib.request import urlopen
else:
# Not Python 3 - today, it is most likely to be Python 2
# But note that this might need an update when Python 4
# might be around one day
from urllib import urlopen
# Scrape one driving-school page from vroomvroom.fr and store its name,
# address and success-rate rows as a single CSV line.
import csv

LISTING_URL = "https://www.vroomvroom.fr/auto-ecoles/paris/paris"
SINGLE_SCHOOL_URL = 'https://www.vroomvroom.fr/auto-ecoles/paris/paris/paris-police'
SINGLE_SCHOOL_CSV = '/Users/Jacobo/Documents/python/vroom/bvrooma.csv'


def _node_text(node):
    """Return the text of a parsed tag, or '' when the tag was not found."""
    return node.text if node is not None else ''


def _parse_rate_row(text):
    """Split the text of one success-rate row into [label, rate, total].

    The label is everything between the first character and the character
    just before the rate; the rate is the first run of digits; the total is
    the run of digits following "sur " ('' when that marker is absent).
    Returns None when the row holds no usable number.
    """
    first_digit = re.search(r"\d", text)
    if first_digit is None:
        return None
    pos = first_digit.start()
    # A digit directly followed by ')' (one or two characters later) is part
    # of a parenthesised label, not the rate: jump to the next digit instead.
    # Slices are used so a digit at the very end cannot raise IndexError.
    for offset in (1, 2):
        if text[pos + offset:pos + offset + 1] == ')':
            next_digit = re.search(r"\d", text[pos + offset:])
            if next_digit is None:
                return None
            pos += offset + next_digit.start()
            break
    # max() keeps the slice end from going negative when the text starts
    # with the digit itself.
    label = text[1:max(pos - 1, 1)]
    # Take the whole number rather than a fixed two characters, so "100" is
    # not truncated and a single digit does not drag the next character in.
    rate = re.match(r"\d+", text[pos:]).group()
    total = re.search(r"sur (\d+)", text)
    return [label, rate, total.group(1) if total else '']


# NOTE(review): the listing page is downloaded and parsed, but `main` is not
# used by the statements below; kept so the script issues the same requests.
with urlopen(LISTING_URL) as url:
    s = url.read()
main = BeautifulSoup(s, 'html.parser')

with urlopen(SINGLE_SCHOOL_URL) as url:
    s = url.read()
soup = BeautifulSoup(s, 'html.parser')

fields = [
    # Name: the page title minus its last 24 characters (presumably the
    # fixed " à Paris - Vroomvroom.fr" suffix).
    _node_text(soup.title)[:-24],
    # Address
    _node_text(soup.find("span", {"itemprop": "streetAddress"})),
    # Postal code
    _node_text(soup.find("span", {"itemprop": "postalCode"})),
    # City
    _node_text(soup.find("span", {"itemprop": "addressLocality"})),
]

# Success rates: the first row is always processed; any later row that
# mentions 'Code' ends the scan.
for index, taux in enumerate(soup.find_all("div", {"class": "row row-success-rates"})):
    if index and 'Code' in taux.text:
        break
    print(taux.text[1:-22])
    parsed = _parse_rate_row(taux.text)
    if parsed is not None:
        fields.extend(parsed)

# The file is opened only once the data is available, so a failed download
# does not leave a truncated CSV behind. csv.writer takes care of quoting
# fields that contain commas or quotes.
with open(SINGLE_SCHOOL_CSV, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['nom', 'adresse', 'cp', 'ville', 'taux'])
    writer.writerow(fields)
import csv
def dif(text, header):
    """Locate the column belonging to the label *text* in *header*.

    Args:
        text: Label prefix of the column names to look for.
        header: Sequence of column names to search.

    Returns:
        The index of the first entry equal to ``text + 'taux'`` or
        ``text + 'sur'``, or -1 when neither is present.
    """
    # Build both candidate names once instead of on every iteration.
    wanted = (text + 'taux', text + 'sur')
    for index, column in enumerate(header):
        if column in wanted:
            return index
    return -1
def ajout(ligne, rows):
    """Append one scraped CSV line to *rows*, aligned on the shared header.

    *ligne* is expected to hold four leading fields followed by any number
    of ``label, taux, sur`` groups. For every group the two values are
    stored under the ``<label>taux`` / ``<label>sur`` columns of the header
    (``rows[0]``); the pair of columns is appended to the header the first
    time a label is seen. Columns the line has no value for are filled with
    a single space.

    Args:
        ligne: The fields of one input line. The list is consumed (emptied).
        rows: The output table; ``rows[0]`` is the header and is extended in
            place, the new row is appended at the end.

    Raises:
        IndexError: If *ligne* is non-empty but has fewer than four fields.
    """
    if not ligne:
        # Blank CSV lines carry no data; ignore them.
        return
    header = rows[0]
    tab = [ligne.pop(0) for _ in range(4)]
    while len(ligne) >= 3:
        label, taux, sur = ligne[:3]
        del ligne[:3]
        taux_column = label + 'taux'
        try:
            # Only look past the four fixed leading columns.
            col = header.index(taux_column, 4)
        except ValueError:
            col = len(header)
            header.extend([taux_column, label + 'sur'])
        # Pad up to the target columns so the values land exactly under
        # their header, whatever order the labels arrive in.
        missing = col + 2 - len(tab)
        if missing > 0:
            tab.extend([' '] * missing)
        tab[col] = taux
        tab[col + 1] = sur
    # An incomplete trailing group (for example the lone 'taux' of the
    # scraper's header line) has no values to store and is dropped.
    del ligne[:]
    rows.append(tab)
# Re-shape the scraped CSV: every label found in the input gets its own
# "<label>taux" / "<label>sur" pair of columns (see ajout), and the aligned
# table is written to a second CSV file.
SCRAPED_CSV = '/Users/Jacobo/Documents/python/vroom/vroomf.csv'
ALIGNED_CSV = '/Users/Jacobo/Documents/python/vroom/trif.csv'

rows = [['nom', 'adresse', 'cp', 'ville']]

# newline='' is what the csv module expects, so that it handles line endings
# (and newlines embedded in quoted fields) itself.
with open(SCRAPED_CSV, 'rt', encoding='utf8', newline='') as f:
    lignes = csv.reader(f)
    # The first input line is the scraper's own header: skip it rather than
    # turning it into a data row that has to be removed afterwards. The
    # default also makes an empty input file harmless.
    next(lignes, None)
    for ligne in lignes:
        if ligne:  # csv.reader yields [] for blank lines
            ajout(ligne, rows)

with open(ALIGNED_CSV, 'wt', encoding='utf8', newline='') as f:
    csv.writer(f).writerows(rows)
import sys, re
from bs4 import BeautifulSoup
def fnum(s):
    """Return the index of the first digit character in *s*.

    Args:
        s: The string to scan.

    Returns:
        The zero-based index of the first character for which
        ``str.isdigit()`` is true, or ``None`` when *s* contains no digit
        (this includes the empty string).
    """
    for index, char in enumerate(s):
        if char.isdigit():
            return index
    return None
if sys.version_info[0] == 3:
from urllib.request import urlopen
else:
# Not Python 3 - today, it is most likely to be Python 2
# But note that this might need an update when Python 4
# might be around one day
from urllib import urlopen
# Scrape every driving school linked from the Paris listing page of
# vroomvroom.fr and write one CSV line per school: name, address, postal
# code, city, then a (label, rate, total) group per success-rate row.
import csv

SITE_ROOT = "https://www.vroomvroom.fr"
LISTING_URL = "https://www.vroomvroom.fr/auto-ecoles/paris/paris"
SCRAPED_CSV = '/Users/Jacobo/Documents/python/vroom/vroomf.csv'


def _node_text(node):
    """Return the text of a parsed tag, or '' when the tag was not found."""
    return node.text if node is not None else ''


def _parse_rate_row(text):
    """Split the text of one success-rate row into [label, rate, total].

    The label is everything between the first character and the character
    just before the rate; the rate is the first run of digits; the total is
    the run of digits following "sur " ('' when that marker is absent).
    Returns None when the row holds no usable number.
    """
    first_digit = re.search(r"\d", text)
    if first_digit is None:
        return None
    pos = first_digit.start()
    # A digit directly followed by ')' (one or two characters later) is part
    # of a parenthesised label, not the rate: jump to the next digit instead.
    # Slices are used so a digit at the very end cannot raise IndexError.
    for offset in (1, 2):
        if text[pos + offset:pos + offset + 1] == ')':
            next_digit = re.search(r"\d", text[pos + offset:])
            if next_digit is None:
                return None
            pos += offset + next_digit.start()
            break
    # max() keeps the slice end from going negative when the text starts
    # with the digit itself.
    label = text[1:max(pos - 1, 1)]
    # Take the whole number rather than a fixed two characters, so "100" is
    # not truncated and a single digit does not drag the next character in.
    rate = re.match(r"\d+", text[pos:]).group()
    total = re.search(r"sur (\d+)", text)
    return [label, rate, total.group(1) if total else '']


with urlopen(LISTING_URL) as url:
    s = url.read()
main = BeautifulSoup(s, 'html.parser')

# csv.writer takes care of quoting fields that contain commas or quotes.
with open(SCRAPED_CSV, 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(['nom', 'adresse', 'cp', 'ville', 'taux'])
    for auto in main.find_all("a", {"class": "school-listing-title"}):
        school_url = SITE_ROOT + auto['href']
        print(school_url)
        with urlopen(school_url) as url:
            s = url.read()
        soup = BeautifulSoup(s, 'html.parser')

        fields = [
            # Name: the page title minus its last 24 characters (presumably
            # the fixed " à Paris - Vroomvroom.fr" suffix).
            _node_text(soup.title)[:-24],
            # Address
            _node_text(soup.find("span", {"itemprop": "streetAddress"})),
            # Postal code
            _node_text(soup.find("span", {"itemprop": "postalCode"})),
            # City
            _node_text(soup.find("span", {"itemprop": "addressLocality"})),
        ]

        # Success rates: the first row is always processed; any later row
        # that mentions 'Code' ends the scan for this school.
        for index, taux in enumerate(soup.find_all("div", {"class": "row row-success-rates"})):
            if index and 'Code' in taux.text:
                break
            parsed = _parse_rate_row(taux.text)
            if parsed is not None:
                fields.extend(parsed)

        writer.writerow(fields)
# print(soup.prettify())
# soup.title.text : page title, e.g. 'CER LEGENDRE à Paris - Vroomvroom.fr'
# soup.find("span", {"itemprop": "streetAddress"}).text
# soup.find("span", {"itemprop": "postalCode"}).text
# soup.find("span", {"itemprop": "addressLocality"}).text
# soup.findAll("div", {"class": "row row-success-rates"})[i].text
# The idea here is to stop at the next occurrence of "\nCode de la route" in these rows.
# On the listing page (https://www.vroomvroom.fr/auto-ecoles/paris/paris):
# soup.findAll("a", {"class": "school-listing-title"})[0]
# soup.findAll("a", {"class": "school-listing-title"})[0]['href']
# print(s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment