Script - Scraping Stage BAFA
"""
Created on Tue Jul 30 12:05:36 2019
@author: 6066305
"""
import time

import numpy as np  # used for the randomized request delays in getData()
import pandas as pd
import mysql.connector
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support.ui import Select
con = mysql.connector.connect(host="localhost",
                              user="appbox_stagebafa",
                              password="Fg#}@qi=Wrh#",
                              database="appbox_stagebafa")
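
# NOTE: the INSERT statements below assume a pre-existing `bafaComp` table whose
# schema is not part of this gist. getDataAfocal() inserts 6 values while
# getData() inserts 7 (it adds an Organisme column), so as written the two
# cannot target the same table. A hypothetical 7-column sketch:
#   CREATE TABLE bafaComp (
#       DateDebut VARCHAR(32), DateFin VARCHAR(32), Themes VARCHAR(255),
#       Organisme VARCHAR(255), Lieu VARCHAR(255), Accueil VARCHAR(255),
#       infos VARCHAR(512)
#   );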
def getDataAfocal():
    """Scrape BAFA sessions from afocal.fr and insert them into MySQL."""
    dateDebut = []
    dateFin = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    # session types: AG = formation générale, AA = approfondissement, AQ = qualification
    dico_values = {"AG": "Générale", "AA": "Approfondissement", "AQ": "Qualification"}
    for value in dico_values:
        def getAll(url, browser):
            # follow one pagination link and harvest its result rows
            rege_url = "https://www.afocal.fr/bafa/" + url
            browser.get(rege_url)
            soup = BeautifulSoup(browser.page_source, 'lxml')
            for tr in soup.find_all('tr')[2:]:
                tds = tr.find_all('td')
                lieu.append(tds[1].text.split("(")[0])
                dateDebut.append(tds[2].text[3:11])
                dateFin.append(tds[2].text[14:22])
                accueil.append(tds[3].text.split(":")[0])
                if value == "AG":
                    themes.append("General")
                else:
                    themes.append(tds[0].text)
                infos.append("https://www.afocal.fr/bafa/" + str(tds[4]).split('"')[3])
        browser = webdriver.Chrome()
        browser.get('https://www.afocal.fr/bafa/recherche-bafa-0-1.html')
        select = Select(browser.find_element_by_id('sel_type'))
        select.select_by_value(value)
        browser.execute_script("document.forms[0].submit();")
        # parse the first results page using BeautifulSoup
        html_content = BeautifulSoup(browser.page_source, 'html.parser')
        html_contents = html_content.find('div', attrs={'id': 'tablo'})
        for tr in html_contents.find_all('tr')[2:]:
            tds = tr.find_all('td')
            lieu.append(tds[1].text.split("(")[0])
            dateDebut.append(tds[2].text[3:11])
            dateFin.append(tds[2].text[14:22])
            accueil.append(tds[3].text.split(":")[0])
            if value == "AG":
                themes.append("General")
            else:
                themes.append(tds[0].text)
            infos.append("https://www.afocal.fr/bafa/" + str(tds[4]).split('"')[3])
        # then visit every pagination link listed at the bottom of the page
        pages = html_content.find('div', attrs={'class': 'pages'})
        for element in str(pages).split('"'):
            if "recherche" in element:
                getAll(str(element), browser)
        browser.quit()
    toCsv = {"DateDebut": dateDebut, "DateFin": dateFin, "Themes": themes,
             "Lieu": lieu, "Accueil": accueil, "infos": infos}
    cursor = con.cursor()
    add_row = ("INSERT INTO bafaComp "
               "VALUES (%(DateDebut)s, %(DateFin)s, %(Themes)s, %(Lieu)s, %(Accueil)s, %(infos)s)")
    for index, row in pd.DataFrame.from_dict(toCsv).iterrows():
        print(row.to_dict())
        cursor.execute(add_row, row.to_dict())
    con.commit()
# generate the useful files
def getDataBafaBafd():
    browser = webdriver.Chrome()
    browser.get('http://bafa-bafd-foyersruraux.org/trouver-son-stage-de-formation')
    browser.execute_script("lancer_recherche();")  # trigger the site's own search
    page_source = browser.page_source
    browser.quit()
    html_content = BeautifulSoup(page_source, 'lxml')
    # each field lives in its own <li>; strip the repeated French label from the text
    dates = [li.text.replace('Dates', '')
             for li in html_content.find_all('li', attrs={'class': 'col bafad-periode mobile-only'})]
    themes = [li.text.replace('Thématique', '')
              for li in html_content.find_all('li', attrs={'class': 'col bafad-theme'})]
    lieu = [li.text.replace('Lieu', '')
            for li in html_content.find_all('li', attrs={'class': 'col bafad-lieu'})]
    accueil = [li.text.replace('Accueil', '')
               for li in html_content.find_all('li', attrs={'class': 'col bafad-accueil'})]
    infos = html_content.find_all('li', attrs={'class': 'col bafad-infos'})
    for i in range(len(infos)):
        if len(str(infos[i]).split('"')) > 3:
            infos[i] = "http://bafa-bafd-foyersruraux.org/" + str(infos[i]).split('"')[3]
    # the [1:] slices skip the first entry of each column (presumably a header row)
    toCsv = {"Date": dates, "Themes": themes[1:], "Lieu": lieu[1:],
             "Accueil": accueil[1:], "infos": infos[1:]}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteBafaBafd.csv", encoding="utf-8")
def getDataLeoLagrange():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    browser.get('https://www.bafa-bafd.org/formation/?region=12&saison=&type=1')
    i = 1
    while True:
        try:
            # the site paginates through a javascript click handler
            browser.execute_script("javascript:clickering(" + str(i) + ");")
            html_content = BeautifulSoup(browser.page_source, 'lxml')
            j = 0  # rows harvested on this page
            for tr in html_content.find_all('tr')[1:]:
                try:
                    tds = tr.find_all('td')
                    lieu.append(tds[2].text)
                    dates.append(tds[0].text)
                    accueil.append(tds[3].text)
                    themes.append(tds[1].text)
                    infos.append("https://www.bafa-bafd.org/" + str(tds[5]).split('"')[3])
                    j += 1
                except Exception:
                    continue
            i += 1
            if j == 0:  # empty page: we are past the last one
                break
        except Exception:
            break
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    pd.DataFrame.from_dict(toCsv).to_csv("SiteLeoLagrange.csv", encoding="utf-8")
    browser.quit()
def getDataFnfr():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    i = 1
    while True:
        browser.get('http://www.ma-formation-bafa.fr/Resultat_Recherche.php?formation=tous&periode=tous&region=18&arbo=2&row_arbo_principal=&row_arbo=Accueil&val_formation=&val_periode=&val_region=&periodetext=Toutes&page=' + str(i) + "&tri=" + str(i))
        html_content = BeautifulSoup(browser.page_source, 'lxml')
        j = 0  # rows harvested on this page
        for tr in html_content.find_all('tr')[1:]:
            try:
                ths = tr.find_all('th')[0].find_all('span')
                tds = tr.find_all('td')
                # the three leading cells flag which boarding options are offered
                accueiltext = ""
                if len(str(tds[0].text)) > 2:
                    accueiltext += "Internat "
                if len(str(tds[1].text)) > 2:
                    accueiltext += "Demi-pension "
                if len(str(tds[2].text)) > 2:
                    accueiltext += "Externat "
                lieu.append(ths[2].text)
                dates.append(ths[1].text)
                accueil.append(accueiltext)
                themes.append(ths[0].text)
                infos.append("http://www.ma-formation-bafa.fr/" + str(tds[3]).split('"')[3])
                j += 1
            except Exception:
                continue
        i += 1
        if j == 0:  # empty page: we are past the last one
            break
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteFnfr.csv", encoding="utf-8")
    browser.quit()
def atcRouteDuMonde():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    browser.get('https://www.atc-routesdumonde.com/formations-bafa-bafd/calendrier-des-sessions/bafa-formation-generale/')
    html_content = BeautifulSoup(browser.page_source, 'lxml')
    text = html_content.find_all(text=True)
    for i in range(len(text)):
        try:
            # a session block starts with a short "du ... au ..." date range
            if "du" in text[i].lower() and "au" in text[i].lower() and len(text[i]) < 150:
                p = 0
                for j in range(15):
                    # keep only text nodes with more than three distinct
                    # characters (filters out whitespace and separators)
                    if len(set(text[i + j])) > 3:
                        if p == 0:
                            dates.append(text[i + j])
                        elif p == 1:
                            lieu.append(text[i + j])
                        elif p == 2:
                            accueil.append(text[i + j][1:])
                        elif p == 3:
                            themes.append("General")
                        p += 1
                infos.append("https://www.atc-routesdumonde.com/formations-bafa-bafd/contact-brochure-devis/")
        except Exception:
            continue
    # same scan on the "approfondissement et qualification" calendar, where the
    # fields appear in a different order (date, theme, place, boarding)
    browser.get('https://www.atc-routesdumonde.com/formations-bafa-bafd/calendrier-des-sessions/approfondissement-et-qualification/')
    html_content = BeautifulSoup(browser.page_source, 'lxml')
    text = html_content.find_all(text=True)
    for i in range(len(text)):
        try:
            if "du" in text[i].lower() and "au" in text[i].lower() and len(text[i]) < 150:
                p = 0
                for j in range(15):
                    if len(set(text[i + j])) > 3:
                        if p == 0:
                            dates.append(text[i + j])
                        elif p == 1:
                            themes.append(text[i + j])
                        elif p == 2:
                            lieu.append(text[i + j][1:])
                        elif p == 3:
                            accueil.append(text[i + j][1:])
                        p += 1
                infos.append("https://www.atc-routesdumonde.com/formations-bafa-bafd/contact-brochure-devis/")
        except Exception:
            continue
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteAtcRoutesDuMonde.csv", encoding="utf-8")
    browser.quit()
def assoCfag():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    i = 0
    while True:
        browser.get('http://www.asso-cfag.com/stages/bafa/?page=' + str(i))
        html_content = BeautifulSoup(browser.page_source, 'lxml')
        for element in html_content.find_all('div', attrs={'class': 'columns mb25'}):
            j = 0
            for text in element.find_all(text=True):
                if len(text) > 3:
                    if j == 0:
                        themes.append(text)
                    if j == 3:
                        # the address line reads "place, street, postcode, city"
                        if len(text.split(",")) >= 4:
                            lieu.append(text.split(",")[3])
                        else:
                            lieu.append(text.split(",")[0])
                    if j == 4:
                        dates.append(text)
                    j += 1
            # note: this appends every link in the block, so infos can grow
            # faster than the other columns
            for link in element.find_all('a'):
                infos.append(link["href"])
            accueil.append("Non renseigné")
        if len(html_content.find_all('div', attrs={'class': 'columns mb25'})) == 0:
            break
        i += 1
    # sample of the markup being parsed:
    """
    <div class="columns mb25">
    <div id="post-6548" class="course-post post-6548 course type-course status-publish hentry course_cat-bafa course_cat-qualification-surveillant-de-baignade thematic-surveillant-de-baignade" data-maps-posts="" data-place="Lycée" data-address="" data-postcode="29200" data-city="Brest">
    <div class="row">
    <div class="medium-12 small-6 columns">
    <a href="http://www.asso-cfag.com/stage/qualification-surveillant-de-baignade-51/"><img src="http://www.asso-cfag.com/wp-content/themes/cfag/assets/images/default/default-post.jpg" alt="Qualification surveillant de baignade"></a> </div>
    <div class="medium-12 small-6 columns">
    <header>
    <h2><a href="http://www.asso-cfag.com/stage/qualification-surveillant-de-baignade-51/">Qualification surveillant de baignade</a></h2>
    </header>
    </div>
    <div class="small-12 columns">
    <div class="entry-content">
    <div class="course_info icon-ribbon">
    <div>
    BAFA - Qualification surveillant de baignade </div>
    </div>
    <div class="course_info icon-map">
    <div>Finistère</div>
    <div>Lycée, , 29200, Brest</div>
    </div>
    <div class="course_info icon-date">
    <div>Toussaint - du 19/10/2019 au 26/10/2019</div>
    </div>
    <a href="http://www.asso-cfag.com/stage/qualification-surveillant-de-baignade-51/" class="button tiny float-right mb20">Plus d'infos</a>
    """
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteCfag.csv", encoding="utf-8")
    browser.quit()
def scoutismeFrancais():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    regions = ['https://www.scoutisme-francais.fr/formation/bourgogne-franche-comte',
               'https://www.scoutisme-francais.fr/formation/hauts-de-france',
               'https://www.scoutisme-francais.fr/formation/bretagne',
               'https://www.scoutisme-francais.fr/formation/ile-de-france',
               'https://www.scoutisme-francais.fr/formation/pays-de-la-loire',
               'https://www.scoutisme-francais.fr/Formation/grand-est',
               'https://www.scoutisme-francais.fr/formation/normandie',
               'https://www.scoutisme-francais.fr/formation/Auvergne-Rhone-Alpes',
               'https://www.scoutisme-francais.fr/Formation/Provence-Alpes-Cote-dAzur',
               'https://www.scoutisme-francais.fr/formation/corse',
               'https://www.scoutisme-francais.fr/formation/occitanie',
               'https://www.scoutisme-francais.fr/formation/centre-val-de-loire',
               'https://www.scoutisme-francais.fr/formation/nouvelle-aquitaine']
    for link in regions:
        browser.get(link)
        time.sleep(3)  # let the ag-grid table render
        html_content = BeautifulSoup(browser.page_source, 'lxml')
        for element in html_content.find_all('div', attrs={'class': 'ag-row ag-row-no-focus ag-row-even ag-row-level-0'})[1:]:
            if len(element.text) > 0 and "BAFD" not in element.text:
                stri = element.text
                stri = stri.replace("Scouts et Guides De France", "")
                stri = stri.replace("Eclaireuses Eclaireurs de France", "")
                # the row text is a fixed-width concatenation of theme, dates
                # and place, hence the hard-coded slice offsets
                if "Formation d’approfondissement" in stri:
                    themes.append("Formation d’approfondissement BAFA")
                    dates.append(stri[34:54])
                    lieu.append(stri[54:])
                else:
                    themes.append("Formation générale BAFA")
                    dates.append(stri[23:43])
                    lieu.append(stri[43:])
                accueil.append("Non renseigné")
                infos.append("https://www.scoutisme-francais.fr/formation/bafa")
    print(dates)
    print(lieu)
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteScoutismeFrancais.csv", encoding="utf-8")
    browser.quit()
def CEMEA():
    dates = []
    themes = []
    lieu = []
    accueil = []
    infos = []
    browser = webdriver.Chrome()
    browser.get("https://cemea-formation.com/f/bafa/")

    def widen_radius_and_load_all():
        # drag the search-radius slider fully to the right, then click the
        # "more" button until no further results can be loaded
        slider = browser.find_element_by_id("slider-radius")
        width = slider.size['width']
        print(width)
        move = ActionChains(browser)
        move.click_and_hold(slider).move_by_offset(0, 0).release().perform()
        move.click_and_hold(slider).move_by_offset(180, 0).release().perform()
        time.sleep(1)
        while True:
            try:
                browser.find_element_by_id('more').click()
                time.sleep(1)
            except Exception:
                break

    def harvest():
        # scrape the fully loaded results panel column by column
        html_content = BeautifulSoup(browser.page_source, 'lxml')
        for typeFormation in html_content.find_all('span', attrs={'class': 'type'})[1:]:
            themes.append(typeFormation.text)
        for ville in html_content.find_all('span', attrs={'class': 'ville'})[1:]:
            lieu.append(ville.text)
        for date in html_content.find_all('span', attrs={'class': 'date'})[1:]:
            dates.append(date.text)
        for a in html_content.find_all('a', attrs={'class': 'pure-button pure-button-primary'})[1:]:
            infos.append("https://cemea-formation.com/f/bafa" + a["href"])
            accueil.append("Non renseigné")

    widen_radius_and_load_all()
    harvest()
    # switch to the next results panel and harvest it the same way
    browser.find_elements_by_xpath("//a[@class='swipe-next arrow']")[0].click()
    time.sleep(1)
    widen_radius_and_load_all()
    harvest()
    toCsv = {"Date": dates, "Themes": themes, "Lieu": lieu, "Accueil": accueil, "infos": infos}
    print(toCsv)
    #pd.DataFrame.from_dict(toCsv).to_csv("SiteCemea.csv", encoding="utf-8")
    browser.quit()
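
# Per-request pauses (in seconds) drawn at random by getData() below, to avoid
# hitting formation-animation.com with back-to-back requests.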
delays = [7, 4, 6, 2, 10, 19]
def getData():
    organismeLiens = []  # collected but not used further in this script
    browser = webdriver.Chrome()
    browser.get('http://www.formation-animation.com/list/diplome/formation_bafa.html')
    html_content = BeautifulSoup(browser.page_source, 'lxml')
    for tr in html_content.find_all('tr', attrs={'bgcolor': '#FFFFFF'}):
        dateDebut = []
        dateFin = []
        themes = []
        lieu = []
        accueil = []
        infos = []
        organisme = []
        browser.get(tr.find_all("a")[0]["href"])
        html_contents = BeautifulSoup(browser.page_source, 'lxml')
        for a in html_contents.find_all("a"):
            try:
                if "http://www.formation-animation.com/view.php?job_id=" in a["href"]:
                    browser.get(a["href"])
                    html = BeautifulSoup(browser.page_source, 'lxml')
                    table = html.find_all('td', attrs={'class': 'view'})
                    accueilStr = ""
                    # walk the detail table cell by cell; the position p of each
                    # cell determines which field it holds
                    p = 0
                    for text in table:
                        if p == 0:
                            themes.append(text.getText().strip().split("\n")[0].strip())
                        if p == 2:
                            dic = text.getText().strip()  # dd/mm/yyyy -> yyyy-mm-dd
                            dateDebut.append(dic[6:] + "-" + dic[3:5] + "-" + dic[0:2])
                        elif p == 3:
                            dic = text.getText().strip()
                            dateFin.append(dic[6:] + "-" + dic[3:5] + "-" + dic[0:2])
                        elif p == 4:
                            organisme.append(text.getText().strip())
                        elif p == 9:
                            lieu.append(text.getText().strip().split(" - ")[0])
                        if "externat" in text.getText().strip().lower():
                            accueilStr += "Externat"
                        if "1/2 pension" in text.getText().strip().lower():
                            accueilStr += "Demi-pension"
                        if "internat" in text.getText().strip().lower():
                            accueilStr += "Internat"
                        p += 1
                    if "http://www.formation-animation.com/view.php?company_id=" in a["href"]:
                        organismeLiens.append(a["href"])
                    accueil.append(accueilStr)
                    # pause a random number of seconds between detail pages
                    time.sleep(np.random.choice(delays))
                    infos.append(a["href"])
                    # note: the lists accumulate across links within this tr, so
                    # earlier rows are inserted again on each new detail page
                    toCsv = {"DateDebut": dateDebut, "DateFin": dateFin, "Themes": themes,
                             "Organisme": organisme, "Lieu": lieu, "Accueil": accueil, "infos": infos}
                    cursor = con.cursor()
                    add_row = ("INSERT INTO bafaComp "
                               "VALUES (%(DateDebut)s, %(DateFin)s, %(Themes)s, %(Organisme)s, %(Lieu)s, %(Accueil)s, %(infos)s)")
                    for index, row in pd.DataFrame.from_dict(toCsv).iterrows():
                        print(row.to_dict())
                        cursor.execute(add_row, row.to_dict())
                    con.commit()
            except Exception as e:
                print(e)
                continue
        browser.get('http://www.formation-animation.com/list/diplome/formation_bafa.html')
    browser.quit()
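
# Minimal entry point (an assumption — the original gist defines the scrapers
# but never calls them; uncomment whichever ones you need).
if __name__ == "__main__":
    getDataAfocal()
    # getDataBafaBafd()
    # getDataLeoLagrange()
    # getDataFnfr()
    # atcRouteDuMonde()
    # assoCfag()
    # scoutismeFrancais()
    # CEMEA()
    # getData()
    con.close()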