Skip to content

Instantly share code, notes, and snippets.

View ShayanRiyaz's full-sized avatar
🎯
Focusing

Shayan Riyaz ShayanRiyaz

🎯
Focusing
View GitHub Profile
# Scrape the Wikipedia list of Los Angeles districts and neighbourhoods.
# NOTE(review): indentation was flattened in the captured source; restored below.
url = requests.get('https://en.wikipedia.org/wiki/List_of_districts_and_neighbourhoods_of_Los_Angeles').text
soup = BeautifulSoup(url, "html.parser")
lis = []
for li in soup.findAll('li'):
    # The 'Portal:Los_Angeles' link marks the end of the neighbourhood list.
    if li.find(href="/wiki/Portal:Los_Angeles"):
        break
    # Keep list items that link to a wiki article.
    if li.find(href=re.compile("^/wiki/")):
        lis.append(li)
    # Pico Robertson is the only item on the list that does not have a
    # hyperlink reference, so match it by text and keep it as well.
    if li.text == 'Pico Robertson[34]':
        lis.append(li)
# Strip citation/reference brackets (e.g. '[12]') from neighbourhood names.
df['Neighbourhood'] = df.Neighbourhood.str.partition('[')[0]
# Drop alternative names listed after a comma (e.g. for 'Bel Air').
df['Neighbourhood'] = df.Neighbourhood.str.partition(',')[0]
# Remove entries that duplicate neighbourhoods already present.
duplicates = [
    'Baldwin Hills/Crenshaw',  # 'Baldwin Hills' and 'Crenshaw' exist already
    'Hollywood Hills West',    # same coordinates as 'Hollywood Hills'
    'Brentwood Circle',        # same coordinates as 'Brentwood'
    'Wilshire Park',           # same coordinates as 'Wilshire Center'
]
df = df[~df.Neighbourhood.isin(duplicates)]
df.reset_index(inplace=True, drop=True)
# Empty data frame that will collect each neighbourhood with its coordinates.
column_names = ['Neighbourhood', 'Latitude', 'Longitude']
nhoods = pd.DataFrame(columns=column_names)
# Nominatim geocoder; 5-second timeout to avoid hanging on slow responses.
geolocator = Nominatim(user_agent="la_explorer", timeout=5)
# Qualify each neighbourhood name with the city, for geocoding.
# NOTE(review): the loop body was flattened in the captured source and appears
# truncated — the geocoding/append steps that presumably follow are not
# visible here; only the address construction is restored.
for i in range(0, len(df)):
    address = df.Neighbourhood[i] + ', Los Angeles'
# Geocode the centre of Los Angeles and build the base folium map.
address = 'Los Angeles, USA'
geolocator = Nominatim(user_agent="la_explorer", timeout=5)  # timeout consistent with earlier setup
location = geolocator.geocode(address)
# geopy returns None when the service cannot resolve the address; fail with a
# clear error instead of an opaque AttributeError on the next line.
if location is None:
    raise ValueError('could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of {} are {}, {}.'.format(address, latitude, longitude))  # fixed typo: 'geograpical'
# create map of LA using the latitude and longitude values
map_la = folium.Map(location=[latitude, longitude], zoom_start=10)
# Foursquare API credentials — fill these in before calling the API.
CLIENT_ID = ''      # Foursquare ID
CLIENT_SECRET = ''  # Foursquare Secret
VERSION = ''        # Foursquare API version
# NOTE(review): echoing secrets to stdout leaks credentials in shared
# notebooks; consider removing these prints once real values are set.
print('Your credentials:')  # fixed typo: 'credentails'
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET: ' + CLIENT_SECRET)
# Sanity check: show the coordinates of the first geocoded neighbourhood.
first_row = nhoods.loc[0]
neighbourhood_name = first_row['Neighbourhood']    # neighbourhood name
neighbourhood_latitude = first_row['Latitude']     # neighbourhood latitude value
neighbourhood_longitude = first_row['Longitude']   # neighbourhood longitude value
print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name,
                                                               neighbourhood_latitude,
                                                               neighbourhood_longitude))
LIMIT = 100  # limit of number of venues returned by Foursquare API
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect nearby venues from the Foursquare 'explore' endpoint.

    For each (name, latitude, longitude) triple, builds an API request URL
    using the module-level CLIENT_ID / CLIENT_SECRET / VERSION credentials.

    NOTE(review): this definition is truncated in the captured source — the
    .format() call below is missing its remaining arguments, and the rest of
    the function body (request, response parsing, return) is not visible here.
    """
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
# One-hot encode the venue categories.
la_onehot = pd.get_dummies(la_venues[['Venue Category']], prefix="", prefix_sep="")
# Re-attach the neighbourhood label to each encoded row.
la_onehot['Neighbourhood'] = la_venues['Neighbourhood']
# Move the 'Neighbourhood' column (currently last) to the front.
cols = list(la_onehot.columns)
la_onehot = la_onehot[cols[-1:] + cols[:-1]]
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the num_top_venues largest values in row.

    The first entry of row (the neighbourhood name) is skipped; the remaining
    values are ranked in descending order and their index labels returned.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]
num_top_venues = 12  # number of top venue categories to report per neighbourhood
indicators = ['st', 'nd', 'rd']  # ordinal suffixes (1st, 2nd, 3rd) — presumably for ranked column labels; confirm against caller
from sklearn.metrics import silhouette_score

# Choose the number of clusters by silhouette analysis: fit K-means for
# k = 2..11 and report the silhouette coefficient for each clustering.
# The positional `axis` argument to DataFrame.drop was deprecated in
# pandas 1.1 and removed in pandas 2.0 — use the explicit keyword form.
la_grouped_clustering = la_grouped.drop('Neighbourhood', axis=1)  # features only
for n_cluster in range(2, 12):
    kmeans = KMeans(n_clusters=n_cluster).fit(la_grouped_clustering)
    label = kmeans.labels_
    sil_coeff = silhouette_score(la_grouped_clustering, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))