Khuyen Tran khuyentran1401

## Web scrape Ghibli Movie DB.py
# Download the list page and parse it. The context manager closes the HTTP
# response as soon as BeautifulSoup has read the document.
with urlopen('https://www.themoviedb.org/list/4309') as html:
    # Name the parser explicitly so the resulting tree does not depend on
    # which optional parser libraries happen to be installed.
    bsObj = BeautifulSoup(html, 'html.parser')

# Create 5 lists that will contain each movie's url, name, rank, rating, and image
urls = []
names = []
ranks = []
ratings = []
images = []

## multiplepages.py
# Fetch a single movie page and parse it. The context manager closes the HTTP
# response once the document has been read.
with urlopen("https://www.themoviedb.org/movie/81") as url:
    # Explicit parser: avoids bs4 guessing one based on what is installed.
    soup = BeautifulSoup(url, 'html.parser')


#find summary: text of the first <p> inside <div class="overview">
# (find() returns None when nothing matches, so a changed page layout
# surfaces here as an AttributeError)
soup.find('div', {'class':'overview'}).p.get_text()


#find director: text of the link inside the first <li class="profile">
soup.find('li', {'class':'profile'}).a.get_text()

## dropMissingValues.py
#https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9
#axis=0 drops rows, axis=1 would drop columns
#thresh is the minimum number of non-NA values a row must contain to be kept,
#so thresh=1 removes only the rows in which every value is missing
X.dropna(axis=0, thresh=1, inplace=True)
#reset the index to the default 0..n-1 range; drop=True discards the old index
#directly instead of inserting it as an 'index' column that then has to be
#dropped by name (which fails when the index is named, and removes real data
#when the frame already has a column called 'index')
X.reset_index(drop=True, inplace=True)

## PuLP
# Load a model from the file "filename.lp".
# NOTE(review): read / optimize / printAttr look like the gurobipy API rather
# than PuLP (the heading above says PuLP) - confirm which library provides `read`.
m = read("filename.lp")

# Solve the loaded model.
m.optimize()

# Print the "X" attribute - presumably the variable values of the solution; verify.
m.printAttr("X")

## Discretizer.py
# Available binning strategies (the `strategy` argument):
#uniform, where all bins in each feature have identical widths.
#quantile (default), where all bins in each feature have the same number of points.
#kmeans, where all values in each bin have the same nearest center of a 1D k-means cluster.

from sklearn.preprocessing import KBinsDiscretizer
# `encode` selects the output format and must be 'onehot', 'onehot-dense' or
# 'ordinal'. 'uniform' is only a valid value for `strategy`; passing it as
# `encode` raises ValueError. 'ordinal' returns the bin index of each value.
disc = KBinsDiscretizer(n_bins=3, encode='ordinal',
                        strategy='uniform')
disc.fit_transform(X)

## CustomTransformers.py
from sklearn.preprocessing import FunctionTransformer

# Wrap np.log1p so it can be used through the scikit-learn transformer interface.
transformer = FunctionTransformer(func=np.log1p, validate=True)

# Apply it to column f2, reshaped into a single-column 2-D array.
f2_as_column = X.f2.values.reshape(-1, 1)
transformer.fit_transform(f2_as_column)  # same output

# The same values computed element by element with pandas.
X.f2.apply(lambda value: np.log1p(value))  # same output

## procssNum.py
import re

def processNum(num):
    """Convert a formatted numeric string to a float.

    Every character that is not a word character, whitespace or a dot is
    removed before conversion, so currency symbols, thousands separators and
    percent signs are dropped ("$1,234.50" -> 1234.5).

    A leading minus sign is preserved, so negative inputs such as "-3.5" or
    "-$2,000" do not silently come back as positive numbers.

    Raises ValueError if what remains is not a valid float literal.
    """
    cleaned = re.sub(r'[^\w\s.]', '', num)
    # The pattern above also strips '-', so re-apply a leading sign by hand.
    if num.lstrip().startswith('-'):
        cleaned = '-' + cleaned.strip()
    return float(cleaned)

#https://regexr.com/

## duplicates.py
# Example frame in which rows 1, 4 and 6 repeat the row directly above them.
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})

#duplicated returns a boolean Series indicating whether each row is a duplicate or not:
data.duplicated()

#drop_duplicates returns a DataFrame with only the rows where duplicated() is False
#(the first occurrence of each row is kept):
data.drop_duplicates()

#specify the subset to detect duplicates
#filter based on column 'k1'
data.drop_duplicates(subset=['k1'])

## pandasMap.py
# NOTE(review): this literal was cut off after "'ounces': [4, 3, 12, 6,". The last
# five ounces values are restored from the textbook example this snippet appears
# to be copied from - verify them against the original data.
data = DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

#map from each food to the animal it comes from (all keys are lower-case)
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

## renamingAxis.py
# 3x4 frame holding 0..11, with named rows and columns.
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

# Mapping a function over the index returns the transformed labels
# (OHIO, COLORADO, NEW YORK) without modifying the frame.
data.index.map(str.upper)

# Assign the mapped labels back to change the row labels of `data` itself.
data.index = data.index.map(str.upper)

#Alternative: rename() builds a new frame with transformed labels
# (title-cased rows, upper-cased columns) and leaves `data` untouched.
data.rename(index=str.title, columns=str.upper)
	# Download the list page and parse it. The context manager closes the HTTP
	# response as soon as BeautifulSoup has read the document.
	with urlopen('https://www.themoviedb.org/list/4309') as html:
		# Name the parser explicitly so the resulting tree does not depend on
		# which optional parser libraries happen to be installed.
		bsObj = BeautifulSoup(html, 'html.parser')

	# Create 5 lists that will contain each movie's url, name, rank, rating, and image
	urls = []
	names = []
	ranks = []
	ratings = []
	images = []
	# Fetch a single movie page and parse it. The context manager closes the HTTP
	# response once the document has been read.
	with urlopen("https://www.themoviedb.org/movie/81") as url:
		# Explicit parser: avoids bs4 guessing one based on what is installed.
		soup = BeautifulSoup(url, 'html.parser')


	#find summary: text of the first <p> inside <div class="overview">
	# (find() returns None when nothing matches, so a changed page layout
	# surfaces here as an AttributeError)
	soup.find('div', {'class':'overview'}).p.get_text()


	#find director: text of the link inside the first <li class="profile">
	soup.find('li', {'class':'profile'}).a.get_text()
	#https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9
	#axis=0 drops rows, axis=1 would drop columns
	#thresh is the minimum number of non-NA values a row must contain to be kept,
	#so thresh=1 removes only the rows in which every value is missing
	X.dropna(axis=0, thresh=1, inplace=True)
	#reset the index to the default 0..n-1 range; drop=True discards the old index
	#directly instead of inserting it as an 'index' column that then has to be
	#dropped by name (which fails when the index is named, and removes real data
	#when the frame already has a column called 'index')
	X.reset_index(drop=True, inplace=True)
	# Available binning strategies (the `strategy` argument):
	#uniform, where all bins in each feature have identical widths.
	#quantile (default), where all bins in each feature have the same number of points.
	#kmeans, where all values in each bin have the same nearest center of a 1D k-means cluster.

	from sklearn.preprocessing import KBinsDiscretizer
	# `encode` selects the output format and must be 'onehot', 'onehot-dense' or
	# 'ordinal'. 'uniform' is only a valid value for `strategy`; passing it as
	# `encode` raises ValueError. 'ordinal' returns the bin index of each value.
	disc = KBinsDiscretizer(n_bins=3, encode='ordinal',
		strategy='uniform')
	disc.fit_transform(X)
	from sklearn.preprocessing import FunctionTransformer

	# Wrap np.log1p so it can be used through the scikit-learn transformer interface.
	transformer = FunctionTransformer(func=np.log1p, validate=True)

	# Apply it to column f2, reshaped into a single-column 2-D array.
	f2_as_column = X.f2.values.reshape(-1, 1)
	transformer.fit_transform(f2_as_column)  # same output

	# The same values computed element by element with pandas.
	X.f2.apply(lambda value: np.log1p(value))  # same output
	import re

	def processNum(num):
		"""Convert a formatted numeric string to a float.

		Every character that is not a word character, whitespace or a dot is
		removed before conversion, so currency symbols, thousands separators and
		percent signs are dropped ("$1,234.50" -> 1234.5).

		A leading minus sign is preserved, so negative inputs such as "-3.5" or
		"-$2,000" do not silently come back as positive numbers.

		Raises ValueError if what remains is not a valid float literal.
		"""
		cleaned = re.sub(r'[^\w\s.]', '', num)
		# The pattern above also strips '-', so re-apply a leading sign by hand.
		if num.lstrip().startswith('-'):
			cleaned = '-' + cleaned.strip()
		return float(cleaned)

	#https://regexr.com/
	# Example frame in which rows 1, 4 and 6 repeat the row directly above them.
	data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [1, 1, 2, 3, 3, 4, 4]})

	#duplicated returns a boolean Series indicating whether each row is a duplicate or not:
	data.duplicated()

	#drop_duplicates returns a DataFrame with only the rows where duplicated() is False
	#(the first occurrence of each row is kept):
	data.drop_duplicates()

	#specify the subset to detect duplicates
	#filter based on column 'k1'
	data.drop_duplicates(subset=['k1'])
	# NOTE(review): this literal was cut off after "'ounces': [4, 3, 12, 6,". The last
	# five ounces values are restored from the textbook example this snippet appears
	# to be copied from - verify them against the original data.
	data = DataFrame({
		'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
			'Bacon', 'pastrami', 'honey ham', 'nova lox'],
		'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
	})

	#map from each food to the animal it comes from (all keys are lower-case)
	meat_to_animal = {
		'bacon': 'pig',
		'pulled pork': 'pig',
		'pastrami': 'cow',
		'corned beef': 'cow',
		'honey ham': 'pig',
		'nova lox': 'salmon',
	}
	# 3x4 frame holding 0..11, with named rows and columns.
	data = DataFrame(
		np.arange(12).reshape(3, 4),
		index=['Ohio', 'Colorado', 'New York'],
		columns=['one', 'two', 'three', 'four'],
	)

	# Mapping a function over the index returns the transformed labels
	# (OHIO, COLORADO, NEW YORK) without modifying the frame.
	data.index.map(str.upper)

	# Assign the mapped labels back to change the row labels of `data` itself.
	data.index = data.index.map(str.upper)

	#Alternative: rename() builds a new frame with transformed labels
	# (title-cased rows, upper-cased columns) and leaves `data` untouched.
	data.rename(index=str.title, columns=str.upper)