Script - Scraping Stage BAFA
"""
Created on Tue Jul 30 12:05:36 2019
@author: 6066305
"""
import time

import mysql.connector
import numpy as np  # needed for np.random.choice() in getData()
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
# MySQL connection shared by the scrapers that write into the bafaComp table
con = mysql.connector.connect(host="localhost",
                              user="appbox_stagebafa",
                              password="Fg#}@qi=Wrh#",
                              database="appbox_stagebafa")
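
# Scrape AFOCAL (afocal.fr): for each session type (AG = formation générale,
# AA = approfondissement, AQ = qualification) submit the search form, read the
# result table and every paginated page, then insert the rows into bafaComp.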
def getDataAfocal():
    dateDebut = []
    dateFin = []
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    dico_values = {"AG": "Générale", "AA": "Approfondissement", "AQ": "Qualification"}
    for value, type in dico_values.items():

        # scrape one paginated result page reached through the "pages" links
        def getAll(url, browser):
            rege_url = "https://www.afocal.fr/bafa/" + url
            browser.get(rege_url)
            pages = browser.page_source
            soup = BeautifulSoup(pages, 'lxml')
            for tr in soup.find_all('tr')[2:]:
                tds = tr.find_all('td')
                lieu.append(tds[1].text.split("(")[0])
                dateDebut.append(tds[2].text[3:11])
                dateFin.append(tds[2].text[14:22])
                accueil.append(tds[3].text.split(":")[0])
                if value == "AG":
                    themes.append("General")
                else:
                    themes.append(tds[0].text)
                infos.append("https://www.afocal.fr/bafa/" + str(tds[4]).split('"')[3])

        browser = webdriver.Chrome()
        browser.get('https://www.afocal.fr/bafa/recherche-bafa-0-1.html')
        select = Select(browser.find_element_by_id('sel_type'))
        select.select_by_value(value)
        browser.execute_script("document.forms[0].submit();")
        page = browser.page_source
        # parse the html using beautifulsoup
        html_content = BeautifulSoup(page, 'html.parser')
        html_contents = html_content.find('div', attrs={'id': 'tablo'})
        for tr in html_contents.find_all('tr')[2:]:
            tds = tr.find_all('td')
            lieu.append(tds[1].text.split("(")[0])
            dateDebut.append(tds[2].text[3:11])
            dateFin.append(tds[2].text[14:22])
            accueil.append(tds[3].text.split(":")[0])
            if value == "AG":
                themes.append("General")
            else:
                themes.append(tds[0].text)
            infos.append("https://www.afocal.fr/bafa/" + str(tds[4]).split('"')[3])
        # follow every pagination link of the search results
        pages = html_content.find('div', attrs={'class': 'pages'})
        for element in str(pages).split('"'):
            if "recherche" in element:
                getAll(str(element), browser)
        browser.quit()
    toCsv = {"DateDebut": dateDebut, "DateFin": dateFin, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    cursor = con.cursor()
    add_row = ("INSERT INTO bafaComp "
               "VALUES (%(DateDebut)s, %(DateFin)s, %(Themes)s, %(Lieu)s, %(Accueil)s, %(infos)s ) ")
    for index, row in pd.DataFrame.from_dict(toCsv).iterrows():
        print(row.to_dict())
        cursor.execute(add_row, row.to_dict())
    con.commit()

# generate the useful files
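
# Scrape bafa-bafd-foyersruraux.org: trigger the search via lancer_recherche(),
# then read the date/theme/place/accueil/infos columns of the result list.
# For now the result is only printed (the CSV export is commented out).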
def getDataBafaBafd():
    browser = webdriver.Chrome()
    browser.get('http://bafa-bafd-foyersruraux.org/trouver-son-stage-de-formation')
    browser.execute_script("lancer_recherche();")
    page_source = browser.page_source
    browser.quit()
    html_content = BeautifulSoup(page_source, 'lxml')
    dates = html_content.find_all('li', attrs={'class': 'col bafad-periode mobile-only'})
    for i in range(len(dates)):
        dates[i] = dates[i].text.replace('Dates', '')
    themes = html_content.find_all('li', attrs={'class': 'col bafad-theme'})
    for i in range(len(themes)):
        themes[i] = themes[i].text.replace('Thématique', '')
    lieu = html_content.find_all('li', attrs={'class': 'col bafad-lieu'})
    for i in range(len(lieu)):
        lieu[i] = lieu[i].text.replace('Lieu', '')
    accueil = html_content.find_all('li', attrs={'class': 'col bafad-accueil'})
    for i in range(len(accueil)):
        accueil[i] = accueil[i].text.replace('Accueil', '')
    infos = html_content.find_all('li', attrs={'class': 'col bafad-infos'})
    for i in range(len(infos)):
        if len(str(infos[i]).split('"')) > 3:
            infos[i] = "http://bafa-bafd-foyersruraux.org/" + str(infos[i]).split('"')[3]
    # the first entry of these columns appears to be a header, hence the [1:] slices
    toCsv = {"Date": dates, "Themes": themes[1:], "Lieu": lieu[1:], "Accueil": accueil[1:], "infos": infos[1:]}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteBafaBafd.csv", encoding="utf-8")
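
# Scrape bafa-bafd.org (Léo Lagrange): page through the results with clickering(i)
# until an empty page comes back, then write them to SiteLeoLagrange.csv.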
def getDataLeoLagrange():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    browser.get('https://www.bafa-bafd.org/formation/?region=12&saison=&type=1')
    i = 1
    while 1:
        try:
            # clickering(i) loads result page i in place
            browser.execute_script("javascript:clickering(" + str(i) + ");")
            page_source = browser.page_source
            html_content = BeautifulSoup(page_source, 'lxml')
            j = 0
            for tr in html_content.find_all('tr')[1:]:
                try:
                    tds = tr.find_all('td')
                    lieu.append(tds[2].text)
                    dates.append(tds[0].text)
                    accueil.append(tds[3].text)
                    themes.append(tds[1].text)
                    infos.append("https://www.bafa-bafd.org/" + str(tds[5]).split('"')[3])
                    j += 1
                except:
                    continue
            i += 1
            if j == 0:
                # empty page: no more results
                break
        except:
            break
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    pd.DataFrame.from_dict(toCsv).to_csv("SiteLeoLagrange.csv", encoding="utf-8")
    browser.quit()
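
# Scrape ma-formation-bafa.fr (FNFR): walk the paginated result table, deriving
# the "accueil" type (internat / demi-pension / externat) from the first three
# cells of each row. For now the result is only printed.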
def getDataFnfr():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    i = 1
    while 1:
        browser.get('http://www.ma-formation-bafa.fr/Resultat_Recherche.php?formation=tous&periode=tous&region=18&arbo=2&row_arbo_principal=&row_arbo=Accueil&val_formation=&val_periode=&val_region=&periodetext=Toutes&page=' + str(i) + "&tri=" + str(i))
        page_source = browser.page_source
        html_content = BeautifulSoup(page_source, 'lxml')
        j = 0
        for tr in html_content.find_all('tr')[1:]:
            try:
                ths = tr.find_all('th')[0].find_all('span')
                tds = tr.find_all('td')
                # the first three cells flag the boarding type offered
                accueiltext = ""
                if len(str(tds[0].text)) > 2:
                    accueiltext += "Internat "
                if len(str(tds[1].text)) > 2:
                    accueiltext += "Demi-pension "
                if len(str(tds[2].text)) > 2:
                    accueiltext += "Externat "
                lieu.append(ths[2].text)
                dates.append(ths[1].text)
                accueil.append(accueiltext)
                themes.append(ths[0].text)
                infos.append("http://www.ma-formation-bafa.fr/" + str(tds[3]).split('"')[3])
                j += 1
            except:
                continue
        i += 1
        if j == 0:
            break
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteFnfr.csv", encoding="utf-8")
    browser.quit()
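
# Scrape atc-routesdumonde.com: the session calendars are plain text, so the code
# looks for "du ... au ..." date strings and reads the following text nodes
# positionally (dates, place, accueil, theme).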
def atcRouteDuMonde():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    i = 1
    # formation générale sessions
    browser.get('https://www.atc-routesdumonde.com/formations-bafa-bafd/calendrier-des-sessions/bafa-formation-generale/')
    page_source = browser.page_source
    html_content = BeautifulSoup(page_source, 'lxml')
    text = html_content.find_all(text=True)
    for i in range(len(text)):
        try:
            if "du" in text[i].lower() and "au" in text[i].lower() and len(text[i]) < 150:
                p = 0
                for j in range(15):
                    if len(set(text[i + j])) > 3:
                        if p == 0:
                            dates.append(text[i + j])
                        elif p == 1:
                            lieu.append(text[i + j])
                        elif p == 2:
                            accueil.append(text[i + j][1:])
                        elif p == 3:
                            themes.append("General")
                        p += 1
                infos.append("https://www.atc-routesdumonde.com/formations-bafa-bafd/contact-brochure-devis/")
        except:
            continue
    # approfondissement and qualification sessions
    browser.get('https://www.atc-routesdumonde.com/formations-bafa-bafd/calendrier-des-sessions/approfondissement-et-qualification/')
    page_source = browser.page_source
    html_content = BeautifulSoup(page_source, 'lxml')
    text = html_content.find_all(text=True)
    for i in range(len(text)):
        try:
            if "du" in text[i].lower() and "au" in text[i].lower() and len(text[i]) < 150:
                p = 0
                for j in range(15):
                    print(text[i + j])
                    if len(set(text[i + j])) > 3:
                        if p == 0:
                            dates.append(text[i + j])
                        elif p == 1:
                            themes.append(text[i + j])
                        elif p == 2:
                            lieu.append(text[i + j][1:])
                        elif p == 3:
                            accueil.append(text[i + j][1:])
                        p += 1
                infos.append("https://www.atc-routesdumonde.com/formations-bafa-bafd/contact-brochure-devis/")
        except:
            continue
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteFnfr.csv", encoding="utf-8")
    browser.quit()
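
# Scrape asso-cfag.com: page through /stages/bafa/?page=N and read the theme,
# place and dates from each course card; a sample card is kept below as a
# reference for the parsing offsets.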
def assoCfag():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    i = 0
    while 1:
        browser.get('http://www.asso-cfag.com/stages/bafa/?page=' + str(i))
        page_source = browser.page_source
        html_content = BeautifulSoup(page_source, 'lxml')
        for element in html_content.find_all('div', attrs={'class': 'columns mb25'}):
            j = 0
            for text in element.find_all(text=True):
                if len(text) > 3:
                    if j == 0:
                        themes.append(text)
                    if j == 3:
                        if len(text.split(",")) >= 4:
                            lieu.append(text.split(",")[3])
                        else:
                            lieu.append(text.split(",")[0])
                    if j == 4:
                        dates.append(text)
                    j += 1
            for link in element.find_all('a'):
                infos.append(link["href"])
            accueil.append("Non renseigné")
        if len(html_content.find_all('div', attrs={'class': 'columns mb25'})) == 0:
            break
        i += 1
    # reference markup for one course card, kept from the original script
    """
    <div class="columns mb25">
    <div id="post-6548" class="course-post post-6548 course type-course status-publish hentry course_cat-bafa course_cat-qualification-surveillant-de-baignade thematic-surveillant-de-baignade" data-maps-posts="" data-place="Lycée" data-address="" data-postcode="29200" data-city="Brest">
    <div class="row">
    <div class="medium-12 small-6 columns">
    <a href="http://www.asso-cfag.com/stage/qualification-surveillant-de-baignade-51/"><img src="http://www.asso-cfag.com/wp-content/themes/cfag/assets/images/default/default-post.jpg" alt="Qualification surveillant de baignade"></a> </div>
    <div class="medium-12 small-6 columns">
    <header>
    <h2><a href="http://www.asso-cfag.com/stage/qualification-surveillant-de-baignade-51/">Qualification surveillant de baignade</a></h2>
    </header>
    </div>
    <div class="small-12 columns">
    <div class="entry-content">
    <div class="course_info icon-ribbon">
    <div>
    BAFA - Qualification surveillant de baignade </div>
    </div>
    <div class="course_info icon-map">
    <div>Finistère</div>
    <div>Lycée, , 29200, Brest</div>
    </div>
    <div class="course_info icon-date">
    <div>Toussaint - du 19/10/2019 au 26/10/2019</div>
    </div>
    <a href="http://www.asso-cfag.com/stage/qualification-surveillant-de-baignade-51/" class="button tiny float-right mb20">Plus d'infos</a>
    """
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteFnfr.csv", encoding="utf-8")
    browser.quit()
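
# Scrape scoutisme-francais.fr: visit each regional calendar page, then cut the
# flat text of every ag-grid row at fixed offsets to recover theme, dates and place.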
def scoutismeFrancais():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    i = 0
    regions = ['https://www.scoutisme-francais.fr/formation/bourgogne-franche-comte',
               'https://www.scoutisme-francais.fr/formation/hauts-de-france',
               'https://www.scoutisme-francais.fr/formation/bretagne',
               'https://www.scoutisme-francais.fr/formation/ile-de-france',
               'https://www.scoutisme-francais.fr/formation/pays-de-la-loire',
               'https://www.scoutisme-francais.fr/Formation/grand-est',
               'https://www.scoutisme-francais.fr/formation/normandie',
               'https://www.scoutisme-francais.fr/formation/Auvergne-Rhone-Alpes',
               'https://www.scoutisme-francais.fr/Formation/Provence-Alpes-Cote-dAzur',
               'https://www.scoutisme-francais.fr/formation/corse',
               'https://www.scoutisme-francais.fr/formation/occitanie',
               'https://www.scoutisme-francais.fr/formation/centre-val-de-loire',
               'https://www.scoutisme-francais.fr/formation/nouvelle-aquitaine']
    for link in regions:
        browser.get(link)
        time.sleep(3)  # let the ag-grid table render before reading the page source
        page_source = browser.page_source
        html_content = BeautifulSoup(page_source, 'lxml')
        for element in html_content.find_all('div', attrs={'class': 'ag-row ag-row-no-focus ag-row-even ag-row-level-0'})[1:]:
            j = 0
            if len(element.text) > 0 and "BAFD" not in element.text:
                stri = element.text
                stri = stri.replace("Scouts et Guides De France", "")
                stri = stri.replace("Eclaireuses Eclaireurs de France", "")
                # the row text is one flat string, so dates and place are cut at fixed offsets
                if "Formation d’approfondissement" in stri:
                    themes.append("Formation d’approfondissement BAFA")
                    dates.append(stri[34:54])
                    lieu.append(stri[54:])
                    accueil.append("Non renseigné")
                    infos.append("https://www.scoutisme-francais.fr/formation/bafa")
                else:
                    themes.append("Formation générale BAFA")
                    dates.append(stri[23:43])
                    lieu.append(stri[43:])
                    accueil.append("Non renseigné")
                    infos.append("https://www.scoutisme-francais.fr/formation/bafa")
    print(dates)
    print(lieu)
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteFnfr.csv", encoding="utf-8")
    browser.quit()
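
# Scrape cemea-formation.com: widen the search-radius slider, click "more" until
# every session is loaded, collect the type/ville/date spans and detail links,
# then do the same for the next results panel.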
def CEMEA():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    i = 0
    browser.get("https://cemea-formation.com/f/bafa/")
    # drag the search-radius slider to its maximum so all sessions are listed
    slider = browser.find_element_by_id("slider-radius")
    width = slider.size['width']
    print(width)
    move = ActionChains(browser)
    move.click_and_hold(slider).move_by_offset(0, 0).release().perform()
    move.click_and_hold(slider).move_by_offset(180, 0).release().perform()
    time.sleep(1)
    moreLoaded = False
    # click the "more" button until it disappears, i.e. everything is loaded
    while moreLoaded == False:
        try:
            submit_button = browser.find_element_by_id('more')
            submit_button.click()
            time.sleep(1)
        except:
            moreLoaded = True
    page_source = browser.page_source
    html_content = BeautifulSoup(page_source, 'lxml')
    for typeFormation in html_content.find_all('span', attrs={'class': 'type'})[1:]:
        themes.append(typeFormation.text)
    for ville in html_content.find_all('span', attrs={'class': 'ville'})[1:]:
        lieu.append(ville.text)
    for date in html_content.find_all('span', attrs={'class': 'date'})[1:]:
        dates.append(date.text)
    for a in html_content.find_all('a', attrs={'class': 'pure-button pure-button-primary'})[1:]:
        infos.append("https://cemea-formation.com/f/bafa" + a["href"])
        accueil.append("Non renseigné")
    # switch to the next results panel and repeat the same extraction
    submit_button = browser.find_elements_by_xpath("//a[@class='swipe-next arrow']")[0]
    submit_button.click()
    time.sleep(1)
    slider = browser.find_element_by_id("slider-radius")
    width = slider.size['width']
    print(width)
    move = ActionChains(browser)
    move.click_and_hold(slider).move_by_offset(0, 0).release().perform()
    move.click_and_hold(slider).move_by_offset(180, 0).release().perform()
    time.sleep(1)
    moreLoaded = False
    while moreLoaded == False:
        try:
            submit_button = browser.find_element_by_id('more')
            submit_button.click()
            time.sleep(1)
        except:
            moreLoaded = True
    page_source = browser.page_source
    html_content = BeautifulSoup(page_source, 'lxml')
    for typeFormation in html_content.find_all('span', attrs={'class': 'type'})[1:]:
        themes.append(typeFormation.text)
    for ville in html_content.find_all('span', attrs={'class': 'ville'})[1:]:
        lieu.append(ville.text)
    for date in html_content.find_all('span', attrs={'class': 'date'})[1:]:
        dates.append(date.text)
    for a in html_content.find_all('a', attrs={'class': 'pure-button pure-button-primary'})[1:]:
        infos.append("https://cemea-formation.com/f/bafa" + a["href"])
        accueil.append("Non renseigné")
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteFnfr.csv", encoding="utf-8")
    browser.quit()
# random delays (in seconds) between page fetches on formation-animation.com
delays = [7, 4, 6, 2, 10, 19]
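
# Scrape formation-animation.com: follow every organisation listed for the BAFA
# diploma, open each session detail page (view.php?job_id=...), read the detail
# table cells by position and insert the rows into bafaComp, sleeping a random
# delay between requests.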
def getData():
    organismeLiens = []
    browser = webdriver.Chrome()
    browser.get('http://www.formation-animation.com/list/diplome/formation_bafa.html')
    page_source = browser.page_source
    html_content = BeautifulSoup(page_source, 'lxml')
    for tr in html_content.find_all('tr', attrs={'bgcolor': '#FFFFFF'}):
        dateDebut = []
        dateFin = []
        themes = []
        lieu = []
        accueil = []
        infos = []
        organisme = []
        # open the organisation page linked from the listing row
        browser.get(tr.find_all("a")[0]["href"])
        page_sources = browser.page_source
        html_contents = BeautifulSoup(page_sources, 'lxml')
        for a in html_contents.find_all("a"):
            try:
                if "http://www.formation-animation.com/view.php?job_id=" in a["href"]:
                    browser.get(a["href"])
                    pge_src = browser.page_source
                    html = BeautifulSoup(pge_src, 'lxml')
                    table = html.find_all('td', attrs={'class': 'view'})
                    accueilStr = ""
                    # p is the position of the cell inside the detail table
                    p = 0
                    for text in table:
                        if p >= 0:
                            if p == 0:
                                themes.append(text.getText().strip().split("\n")[0].strip())
                            if p == 2:
                                dic = text.getText().strip()
                                dateDebut.append(dic[6:] + "-" + dic[3:5] + "-" + dic[0:2])
                            elif p == 3:
                                dic = text.getText().strip()
                                dateFin.append(dic[6:] + "-" + dic[3:5] + "-" + dic[0:2])
                            elif p == 4:
                                organisme.append(text.getText().strip())
                            elif p == 9:
                                lieu.append(text.getText().strip().split(" - ")[0])
                            if "Externat".lower() in text.getText().strip().lower():
                                accueilStr += "Externat"
                            if "1/2 pension".lower() in text.getText().strip().lower():
                                accueilStr += "Demi-pension"
                            if "Internat".lower() in text.getText().strip().lower():
                                accueilStr += "Internat"
                            p += 1
                    try:
                        if "http://www.formation-animation.com/view.php?company_id=" in a["href"]:
                            organismeLiens.append(a["href"])
                    except:
                        continue
                    accueil.append(accueilStr)
                    # pause for a random delay so the site is not hammered
                    delay = np.random.choice(delays)
                    time.sleep(delay)
                    infos.append(a["href"])
                    toCsv = {"DateDebut": dateDebut, "DateFin": dateFin, "Themes": themes, "Organisme": organisme, "Lieu": lieu, "Accueil": accueil, "infos": infos}
                    cursor = con.cursor()
                    add_row = ("INSERT INTO bafaComp "
                               "VALUES (%(DateDebut)s, %(DateFin)s, %(Themes)s, %(Organisme)s, %(Lieu)s, %(Accueil)s, %(infos)s ) ")
                    for index, row in pd.DataFrame.from_dict(toCsv).iterrows():
                        print(row.to_dict())
                        cursor.execute(add_row, row.to_dict())
                    con.commit()
            except Exception as e:
                print(e)
                continue
        # go back to the listing before handling the next organisation
        browser.get('http://www.formation-animation.com/list/diplome/formation_bafa.html')
    browser.quit()
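
# The gist never shows how these scrapers are launched, so the entry point below
# is only a sketch (an assumption, not part of the original script): it runs the
# two functions that actually write into the bafaComp table and closes the
# MySQL connection afterwards.
if __name__ == "__main__":
    getDataAfocal()
    getData()
    con.close()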