Scraping vroomvroom.fr driving-school listings and loading the data into a database (CSV files)
import sys, re | |
from bs4 import BeautifulSoup | |
def fnum(s):
    """Return the index of the first decimal digit in *s*, or None if there is none."""
    for idx, ch in enumerate(s):
        if ch.isdigit():
            return idx
    return None
# Select the correct urlopen for the running interpreter:
# it moved from urllib (Python 2) to urllib.request (Python 3).
if sys.version_info[0] == 3:
    from urllib.request import urlopen
else:
    # Not Python 3 - today, it is most likely to be Python 2
    # But note that this might need an update when Python 4
    # might be around one day
    from urllib import urlopen
# Your code where you can use urlopen
# Prototype run: fetch the Paris listing page, then scrape a single
# school page ("paris-police") and write its data as one CSV row.
with urlopen("https://www.vroomvroom.fr/auto-ecoles/paris/paris") as url:
    s = url.read()
main = BeautifulSoup(s, 'html.parser')
with open('/Users/Jacobo/Documents/python/vroom/bvrooma.csv', 'wb') as f:
    # CSV header row
    f.write(('nom,adresse,cp,ville,taux\n').encode('utf-8'))
    with urlopen('https://www.vroomvroom.fr/auto-ecoles/paris/paris/paris-police') as url:
        # print("https://www.vroomvroom.fr"+auto['href'])
        s = url.read()
        soup = BeautifulSoup(s, 'html.parser')
        # title/name: drop a fixed-length site suffix from the page title
        # (assumes a 24-character tail like " à Paris - Vroomvroom.fr" — TODO confirm)
        f.write((soup.title.text[:-24]).encode('utf-8'))
        f.write(','.encode('utf-8'))
        # address (quoted: street addresses may contain commas)
        f.write('"'.encode('utf-8') + soup.find("span", {"itemprop": "streetAddress"}).text.encode('utf-8') + '"'.encode(
            'utf-8'))
        f.write(','.encode('utf-8'))
        # postal code
        f.write(soup.find("span", {"itemprop": "postalCode"}).text.encode('utf-8'))
        f.write(','.encode('utf-8'))
        # city
        f.write(soup.find("span", {"itemprop": "addressLocality"}).text.encode('utf-8'))
        # the pass rates: one "row-success-rates" div per licence type;
        # stop when a "Code" section shows up again after the first block
        x = 0
        for taux in soup.findAll("div", {"class": "row row-success-rates"}):
            if (x != 0):
                if ('Code' in taux.text):
                    #f.write('\n'.encode('utf-8'))
                    break
            print(taux.text[1:-22])
            x = 1
            s = taux.text
            # locate the first digit of the rate; skip a parenthesised
            # figure such as "(1)" that can precede the real percentage
            m = re.search("\d", s)
            if (m):
                m = m.start()
                if s[m + 1] == ')':
                    a = re.search("\d", s[m + 1:]).start()
                    m +=a+1
                    #print(a,m)
                elif s[m+2]==')':
                    a = re.search("\d", s[m + 2:]).start()
                    m += a + 2
            # NOTE(review): if no digit was found, m is still None here and the
            # slices below raise TypeError — relies on rates always being present
            f.write(','.encode('utf-8'))
            f.write(s[1:m - 1].encode('utf-8'))  # rate label
            f.write(','.encode('utf-8'))
            f.write(s[m:m + 2].encode('utf-8'))  # percentage (first two chars)
            f.write(','.encode('utf-8'))
            # sample size: the number following "sur"
            m = re.search("sur \d", s)
            if m:
                m = m.end() - 1
                f.write(s[m:m + 2].encode('utf-8'))
        f.write('\n'.encode('utf-8'))
import csv | |
def dif(text, header):
    """Return the index in *header* of the column named *text*+'taux' or
    *text*+'sur'; return -1 when neither column exists."""
    wanted = (text + 'taux', text + 'sur')
    for idx, column in enumerate(header):
        if column in wanted:
            return idx
    return -1
def ajout(ligne, rows):
    """Append one parsed CSV line to *rows*, aligning its rate values with
    the dynamic header kept in rows[0].

    Columns 4+ of the header come in '<label>taux'/'<label>sur' pairs that
    are discovered on the fly as new labels appear in the data.
    Mutates both *ligne* (consumed) and *rows* (header grown, row appended).
    """
    # first four fields are fixed: nom, adresse, cp, ville
    tab = []
    for i in range(4):
        tab.append(ligne.pop(0))
    while (ligne != []):
        #print(ligne)
        # position of this line's next label within the dynamic header part
        a = dif(ligne[0], rows[0][4:])
        if a == 0:
            # label matches the next expected column pair: drop the label,
            # its two values are consumed at the bottom of the loop
            ligne.pop(0)
        elif a > 0:
            # label sits further right in the header: pad two blanks per
            # skipped column pair.
            # NOTE(review): the label itself is NOT popped in this branch, so
            # the pair-append below picks up the label as a value — verify intended
            print(tab)
            for i in range(a*2):
                tab.append(' ')
            print(tab)
        else:
            # unknown label: extend the shared header with a new pair
            x=ligne.pop(0)
            #rows[0].append(x)
            rows[0].append(x+'taux')
            rows[0].append(x + 'sur')
        if(ligne!=[]):
            # consume the (taux, sur) value pair for this label
            for i in range(2):
                tab.append(ligne.pop(0))
    rows.append(tab)
# header[4:] holds the dynamically discovered rate-column names.
# Re-shape vroomf.csv (variable rate columns per row) into trif.csv,
# where every row is aligned against one shared header.
with open('/Users/Jacobo/Documents/python/vroom/vroomf.csv','rt',encoding='utf8') as f:
    lignes=csv.reader(f)
    rows=[['nom','adresse','cp', 'ville']]
    for ligne in lignes:
        ajout(ligne, rows)
# newline='' is required when handing a text-mode file to csv.writer;
# without it, platforms that translate '\n' emit doubled line endings.
with open('/Users/Jacobo/Documents/python/vroom/trif.csv','wt',encoding='utf8',newline='') as f:
    writer = csv.writer(f)
    rows.pop(1)  # drop the row ajout() built from vroomf.csv's own header line
    writer.writerows(rows)
import sys, re | |
from bs4 import BeautifulSoup | |
def fnum(s):
    """Index of the first decimal digit in *s*; None when *s* contains no digit."""
    return next((i for i, c in enumerate(s) if c.isdigit()), None)
# urlopen lives in urllib.request on Python 3 but plain urllib on Python 2;
# import whichever matches the running interpreter.
if sys.version_info[0] == 3:
    from urllib.request import urlopen
else:
    # Not Python 3 - today, it is most likely to be Python 2
    # But note that this might need an update when Python 4
    # might be around one day
    from urllib import urlopen
# Your code where you can use urlopen
# Main crawl: for every school linked from the Paris listing page, scrape
# name/address/postal-code/city and the pass rates, writing one CSV row
# per school into vroomf.csv.
with urlopen("https://www.vroomvroom.fr/auto-ecoles/paris/paris") as url:
    s = url.read()
main = BeautifulSoup(s, 'html.parser')
with open('/Users/Jacobo/Documents/python/vroom/vroomf.csv', 'wb') as f:
    # CSV header row (rate columns are discovered later, see ajout)
    f.write(('nom,adresse,cp,ville,taux\n').encode('utf-8'))
    for auto in main.findAll("a", {"class": "school-listing-title"}):
        with urlopen("https://www.vroomvroom.fr" + auto['href']) as url:
            print("https://www.vroomvroom.fr" + auto['href'])
            s = url.read()
            soup = BeautifulSoup(s, 'html.parser')
            # title/name: drop a fixed-length site suffix from the page title
            # (assumes a 24-character tail like " à Paris - Vroomvroom.fr" — TODO confirm)
            f.write((soup.title.text[:-24]).encode('utf-8'))
            f.write(','.encode('utf-8'))
            # address (quoted: street addresses may contain commas)
            f.write(
                '"'.encode('utf-8') + soup.find("span", {"itemprop": "streetAddress"}).text.encode('utf-8') + '"'.encode(
                    'utf-8'))
            f.write(','.encode('utf-8'))
            # postal code
            f.write(soup.find("span", {"itemprop": "postalCode"}).text.encode('utf-8'))
            f.write(','.encode('utf-8'))
            # city
            f.write(soup.find("span", {"itemprop": "addressLocality"}).text.encode('utf-8'))
            # the pass rates: one "row-success-rates" div per licence type;
            # stop when a "Code" section shows up again after the first block
            x = 0
            for taux in soup.findAll("div", {"class": "row row-success-rates"}):
                if (x != 0):
                    if ('Code' in taux.text):
                        # f.write('\n'.encode('utf-8'))
                        break
                # print(taux.text[1:-22])
                x = 1
                s = taux.text
                # locate the first digit of the rate; skip a parenthesised
                # figure such as "(1)" that can precede the real percentage
                m = re.search("\d", s)
                if (m):
                    m = m.start()
                    if s[m + 1] == ')':
                        a = re.search("\d", s[m + 1:]).start()
                        m += a + 1
                    elif s[m + 2] == ')':
                        a = re.search("\d", s[m + 2:]).start()
                        m += a + 2
                # NOTE(review): if no digit was found, m is still None here and
                # the slices below raise TypeError — relies on rates being present
                f.write(','.encode('utf-8'))
                f.write(s[1:m - 1].encode('utf-8'))  # rate label
                f.write(','.encode('utf-8'))
                f.write(s[m:m + 2].encode('utf-8'))  # percentage (first two chars)
                f.write(','.encode('utf-8'))
                # sample size: the number following "sur"
                m = re.search("sur \d", s)
                if m:
                    m = m.end() - 1
                    f.write(s[m:m + 2].encode('utf-8'))
            f.write('\n'.encode('utf-8'))
# print(soup.prettify()) | |
# soup.title.text : titre 'CER LEGENDRE à Paris - Vroomvroom.fr' | |
# soup.find("span", {"itemprop": "streetAddress"}).text | |
# soup.find("span", {"itemprop": "postalCode"}).text | |
# soup.find("span", {"itemprop": "addressLocality"}).text | |
# soup.findAll("div", {"class": "row row-success-rates"})[i].text | |
# the idea here is to watch for a new occurrence of "\nCode de la route\n" to know when to stop
# on: https://www.vroomvroom.fr/auto-ecoles/paris/paris | |
# soup.findAll("a", {"class": "school-listing-title"})[0] | |
# soup.findAll("a", {"class": "school-listing-title"})[0]['href'] | |
# print(s) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment