Skip to content

Instantly share code, notes, and snippets.

# Import Libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Download the staff directory page.
# timeout so a hung server cannot block the script forever.
web_page = requests.get('https://www.newvisions.org/ams2/pages/our-staff2', timeout=10)
# Parse the HTML into a navigable tree.
soup = BeautifulSoup(web_page.text, 'html.parser')
# Each staff member is wrapped in a div.matrix-content block; the first
# 29 matches are page furniture rather than staff, so drop them.
results = soup.find_all('div', attrs={'class': 'matrix-content'})
results = results[29:]
# Sanity-check how many staff blocks were kept (REPL-style no-op in a script).
len(results)
# Spot-check the first staff block: name (<h5>), position(s) (<p>,
# stripped of newline/tab padding), and email (<em>).
test_result = results[0]
test_result.find('h5')
test_result.find('h5').text
test_result.find('p').text.strip('\n\t')
test_result.find('em').get_text()
# Build the DataFrame BEFORE the dedupe/export steps.
# NOTE: the original snippet referenced df before it was created,
# which raises NameError when run top-to-bottom — fixed by reordering.
df = pd.DataFrame()
# One row per staff block: name and position title.
df['Teacher_Names'] = [result.find('h5').text for result in results]
df['Positions'] = [result.find('p').text.strip('\n\t') for result in results]
# Report how many duplicate names exist, then keep only the first
# occurrence of each.
print(df.duplicated(['Teacher_Names']).sum())
df.drop_duplicates(['Teacher_Names'], keep='first', inplace=True)
# Export without the numeric index column.
df.to_csv('BronxSchoolStaffInfo.csv', index=False)
# Create a function to get emails since some are missing
@JohnDeJesus22
JohnDeJesus22 / JssuniSitedownload
Created August 21, 2019 16:56
JssuniSitedownload
# Import Libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Download the JSS University staff listing page.
# timeout added so the request cannot hang indefinitely on an
# unresponsive server (requests has no default timeout).
web_page = requests.get(
    'https://jssuni.edu.in/JSSWeb/WebShowFromDB.aspx?MODE=SSMD&PID=10002&CID=3&DID=2&MID=0&SMID=10402',
    timeout=10,
)
@JohnDeJesus22
JohnDeJesus22 / Jssunitest
Created August 21, 2019 18:45
Jssunitest
# Inspect the first profile block to work out which tags hold each field.
# (Bare expressions — REPL/notebook-style spot checks, values discarded.)
sample = results[0]
# The staff member's name lives in the <h2> tag.
sample.find('h2').text
# The designation is the second child node of the <p> tag; trim the
# surrounding spaces.
sample.find('p').contents[1].strip(' ')
# Email
@JohnDeJesus22
JohnDeJesus22 / JssuniPageparser
Last active August 21, 2019 19:40
JssuniPageparser
# Parse the HTML
# Turn the downloaded page (web_page, fetched in an earlier snippet)
# into a navigable BeautifulSoup tree.
soup = BeautifulSoup(web_page.text, 'html.parser')
# Create Set with HTML tags based on results
# Each staff profile sits in a div with this exact class string.
results = soup.find_all('div', attrs={'class': 'tab-pane active in fade'})
# Check results and remove first tag since it isn't a profile block
# (the bare len() calls are REPL-style checks before/after the slice;
# they have no effect when run as a script).
len(results)
results = results[1:]
len(results)
@JohnDeJesus22
JohnDeJesus22 / JssuniScrap
Last active August 21, 2019 19:50
JssuniScrap
# Collect the scraped profile fields into a fresh DataFrame.
df = pd.DataFrame()
# Staff names come from each profile block's <h2> tag.
df['Name'] = [prof.find('h2').text for prof in results]
# The designation is the second child node of the <p> tag, with the
# surrounding spaces trimmed off.
df['Designation'] = [prof.find('p').contents[1].strip(' ') for prof in results]
# Email
@JohnDeJesus22
JohnDeJesus22 / kmeansgrouper.py
Created October 22, 2019 14:20
Class to create categories for TeacherBoard web app feature.
from sklearn.cluster import KMeans
import json
import plotly
import plotly.graph_objs as go
class KmeansGrouper:
# may need a default number of clusters (maybe 2?) since a convergence warning can occur due to duplicate values
# ex: only 2 clusters found when we requested 4.