Skip to content

Instantly share code, notes, and snippets.

View khuyentran1401's full-sized avatar
🏠
Working from home

Khuyen Tran khuyentran1401

🏠
Working from home
View GitHub Profile
# Fetch the movie list page and parse it into a BeautifulSoup tree.
html = urlopen('https://www.themoviedb.org/list/4309')
bsObj = BeautifulSoup(html)
# Five empty accumulators: each movie's url, name, rank, rating, and image.
urls, names, ranks, ratings, images = [], [], [], [], []
# Fetch a single movie page and parse it.
# Note: `url` holds the response object returned by urlopen, not a URL string.
url = urlopen("https://www.themoviedb.org/movie/81")
soup = BeautifulSoup(url)
# Summary: text of the <p> inside the first <div class="overview">.
soup.find('div', {'class':'overview'}).p.get_text()
# Director: text of the <a> inside the first <li class="profile">.
# NOTE(review): assumes the first "profile" entry is the director - confirm against the page markup.
soup.find('li', {'class':'profile'}).a.get_text()
@khuyentran1401
khuyentran1401 / dropMissingValues.py
Last active January 19, 2020 11:07
Deal with Missing Values
#https://towardsdatascience.com/preprocessing-with-sklearn-a-complete-and-comprehensive-guide-670cb98fcfb9
# Drop rows in place: axis=0 operates on rows, axis=1 would operate on columns.
# thresh is the minimum number of non-NA values a row must have to be kept,
# so thresh=1 removes only the rows in which every value is missing.
X.dropna(axis=0, thresh=1, inplace=True)
# Renumber the rows with the default 0..n-1 index. drop=True discards the old
# index instead of inserting it as a column, so no follow-up drop(['index']) is
# needed (that drop raised KeyError whenever the old index had a name or was a
# MultiIndex, because the inserted column is then not called 'index').
X.reset_index(drop=True, inplace=True)
# Load the model stored in the LP-format file.
# NOTE(review): `read` is presumably gurobipy.read - confirm against the file's imports.
m = read("filename.lp")
# Solve the model.
m.optimize()
# Print the "X" attribute.
# NOTE(review): presumably the solution value of each variable - confirm.
m.printAttr("X")
# Available binning strategies (the `strategy` argument):
#uniform, where all bins in each feature have identical widths.
#quantile (default), where all bins in each feature have the same number of points.
#kmeans, where all values in each bin have the same nearest center of a 1D k-means cluster.
from sklearn.preprocessing import KBinsDiscretizer
# `encode` selects the output format and must be 'onehot', 'onehot-dense' or
# 'ordinal'; 'uniform' is a strategy name, not an encoding, and is rejected.
# 'ordinal' returns the bin index of each value as a number.
disc = KBinsDiscretizer(n_bins=3, encode='ordinal',
                        strategy='uniform')
disc.fit_transform(X)
from sklearn.preprocessing import FunctionTransformer
# Wrap np.log1p (log(1 + x)) so it can be used as a scikit-learn transformer.
transformer = FunctionTransformer(np.log1p, validate=True)
# The column is reshaped to a single-column 2-D array before being transformed.
transformer.fit_transform(X.f2.values.reshape(-1, 1)) # log1p of column f2, as a 2-D array
X.f2.apply(lambda x : np.log1p(x)) # same values, computed element by element on the pandas column
@khuyentran1401
khuyentran1401 / procssNum.py
Created January 16, 2020 15:28
Process a number given as a string and return it as a number
import re
def processNum(num):
    """Convert a formatted numeric string such as "$1,234.50" to a float.

    Every character that is not a word character, whitespace or a decimal
    point is removed before conversion, which strips currency symbols,
    thousands separators, percent signs and similar decoration.

    Args:
        num: The string to convert.

    Returns:
        The numeric value as a float. A leading minus sign is honoured.

    Raises:
        ValueError: If the cleaned string is not a valid float literal
            (for example, it is empty or still contains letters).
    """
    cleaned = re.sub(r'[^\w\s.]', '', num)
    value = float(cleaned)
    # The pattern above also strips '-', so restore the sign explicitly;
    # otherwise negative inputs would silently come back positive.
    if num.lstrip().startswith('-'):
        return -value
    return value
#https://regexr.com/
@khuyentran1401
khuyentran1401 / duplicates.py
Created January 16, 2020 15:38
removing duplicate rows
# Sample frame containing repeated (k1, k2) rows.
data = DataFrame(
    {
        'k1': ['one'] * 3 + ['two'] * 4,
        'k2': [1, 1, 2, 3, 3, 4, 4],
    }
)
# duplicated() gives a boolean Series: True for each row that repeats an earlier row.
data.duplicated()
# drop_duplicates() gives a DataFrame with those repeated rows removed,
# i.e. only the rows where duplicated() is False.
data.drop_duplicates()
#specify the subset to detect duplicates
#filter based on column 'k1'
@khuyentran1401
khuyentran1401 / pandasMap.py
Created January 16, 2020 15:45
Use function for mapping to transform
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],'ounces': [4, 3, 12, 6,
#map each food to the animal it comes from
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
# 3x4 frame holding 0..11, with named row and column labels.
data = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# map() applies the function to every index label and returns the result;
# the frame itself is not modified by this call.
data.index.map(str.upper)
#Out: the labels OHIO, COLORADO, NEW YORK
# Assign the mapped labels back to actually change the frame's index.
data.index = data.index.map(str.upper)
# Alternative: rename() returns a new frame with title-cased row labels and
# upper-cased column labels, leaving `data` unchanged.
data.rename(index=str.title, columns=str.upper)