Skip to content

Instantly share code, notes, and snippets.

# Import Libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Download the staff directory page.
# timeout so a hung server cannot block the script forever.
web_page = requests.get('https://www.newvisions.org/ams2/pages/our-staff2', timeout=10)
# Parse the HTML into a navigable tree.
soup = BeautifulSoup(web_page.text, 'html.parser')
# Each staff member is wrapped in a div.matrix-content block; the first
# 29 matches are page furniture rather than staff, so drop them.
results = soup.find_all('div', attrs={'class': 'matrix-content'})
results = results[29:]
# Sanity-check how many staff blocks were kept (REPL-style no-op in a script).
len(results)
# Spot-check the first staff block: name (<h5>), position(s) (<p>,
# stripped of newline/tab padding), and email (<em>).
test_result = results[0]
test_result.find('h5')
test_result.find('h5').text
test_result.find('p').text.strip('\n\t')
test_result.find('em').get_text()
# Build the DataFrame BEFORE the dedupe/export steps.
# NOTE: the original snippet referenced df before it was created,
# which raises NameError when run top-to-bottom — fixed by reordering.
df = pd.DataFrame()
# One row per staff block: name and position title.
df['Teacher_Names'] = [result.find('h5').text for result in results]
df['Positions'] = [result.find('p').text.strip('\n\t') for result in results]
# Report how many duplicate names exist, then keep only the first
# occurrence of each.
print(df.duplicated(['Teacher_Names']).sum())
df.drop_duplicates(['Teacher_Names'], keep='first', inplace=True)
# Export without the numeric index column.
df.to_csv('BronxSchoolStaffInfo.csv', index=False)
# Create a function to get emails since some are missing
@JohnDeJesus22
JohnDeJesus22 / JssuniSitedownload
Created August 21, 2019 16:56
JssuniSitedownload
# Import Libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Download the JSS University staff listing page.
# timeout added so the request cannot hang indefinitely on an
# unresponsive server (requests has no default timeout).
web_page = requests.get(
    'https://jssuni.edu.in/JSSWeb/WebShowFromDB.aspx?MODE=SSMD&PID=10002&CID=3&DID=2&MID=0&SMID=10402',
    timeout=10,
)
@JohnDeJesus22
JohnDeJesus22 / Jssunitest
Created August 21, 2019 18:45
Jssunitest
# Inspect the first profile block to work out which tags hold each field.
# (Bare expressions — REPL/notebook-style spot checks, values discarded.)
sample = results[0]
# The staff member's name lives in the <h2> tag.
sample.find('h2').text
# The designation is the second child node of the <p> tag; trim the
# surrounding spaces.
sample.find('p').contents[1].strip(' ')
# Email
@JohnDeJesus22
JohnDeJesus22 / JssuniPageparser
Last active August 21, 2019 19:40
JssuniPageparser
# Parse the HTML
# Turn the downloaded page (web_page, fetched in an earlier snippet)
# into a navigable BeautifulSoup tree.
soup = BeautifulSoup(web_page.text, 'html.parser')
# Create Set with HTML tags based on results
# Each staff profile sits in a div with this exact class string.
results = soup.find_all('div', attrs={'class': 'tab-pane active in fade'})
# Check results and remove first tag since it isn't a profile block
# (the bare len() calls are REPL-style checks before/after the slice;
# they have no effect when run as a script).
len(results)
results = results[1:]
len(results)
@JohnDeJesus22
JohnDeJesus22 / JssuniScrap
Last active August 21, 2019 19:50
JssuniScrap
# Collect the scraped profile fields into a fresh DataFrame.
df = pd.DataFrame()
# Staff names come from each profile block's <h2> tag.
df['Name'] = [prof.find('h2').text for prof in results]
# The designation is the second child node of the <p> tag, with the
# surrounding spaces trimmed off.
df['Designation'] = [prof.find('p').contents[1].strip(' ') for prof in results]
# Email
@JohnDeJesus22
JohnDeJesus22 / kmeansgrouper.py
Created October 22, 2019 14:20
Class to create categories for TeacherBoard web app feature.
from sklearn.cluster import KMeans
import json
import plotly
import plotly.graph_objs as go
class KmeansGrouper:
# may need a default number of clusters (maybe 2?) since a convergence warning can occur due to duplicate values
# ex: only 2 clusters found when we requested 4.