alexanderholt
import seaborn as sns
import matplotlib.pyplot as plt

def gen_hist(df):
    # plot a distribution (histogram) for each column in the dataframe
    for col in df.columns:
        sns.distplot(df[col])
        plt.show()

gen_hist(df_num)
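The original comment mentions plotting all relationships; if pairwise relationships (rather than per-column histograms) are what is wanted, seaborn's pairplot does that in one call. A minimal sketch, assuming df_num is a DataFrame of numeric columns:

import seaborn as sns
import matplotlib.pyplot as plt

# scatter plots for every pair of numeric columns, histograms on the diagonal
sns.pairplot(df_num)
plt.show()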
@alexanderholt
alexanderholt / eda
Last active October 16, 2017 13:12
def eda(dataframe):
    # this code is from @ritikabhasker, slightly adapted, but mainly hers.
    print("**MISSING VALUES** \n", dataframe.isnull().sum(), "\n")
    print("**DATAFRAME INDEX** \n", dataframe.index, "\n")
    print("**DATAFRAME TYPES** \n", dataframe.dtypes, "\n")
    print("**DATAFRAME SHAPE** \n", dataframe.shape, "\n")
    print("**DATAFRAME DESCRIBE** \n", dataframe.describe(), "\n")
    print("**NUMBER OF UNIQUE VALUES PER COLUMN**")
    for item in dataframe:
        print(item, dataframe[item].nunique())
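A quick usage sketch, assuming pandas is imported; the small DataFrame and its column names here are made up for illustration:

import pandas as pd

df = pd.DataFrame({
    'age': [22, 38, None, 35],
    'fare': [7.25, 71.28, 7.92, 8.05],
})
eda(df)  # prints missing values, index, dtypes, shape, describe(), and unique counts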
@alexanderholt
alexanderholt / savehighres
Created October 10, 2017 12:55
Save a high res png
# Include this after you import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['savefig.dpi'] = 500
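For context, a minimal sketch of how the rcParams setting takes effect when saving (the figure here is made up); the dpi can also be passed directly to savefig:

import matplotlib.pyplot as plt

plt.rcParams['savefig.dpi'] = 500

fig, ax = plt.subplots()
ax.plot([0, 1, 2], [0, 1, 4])
fig.savefig('high_res_plot.png')               # saved at 500 dpi via rcParams
fig.savefig('high_res_plot_alt.png', dpi=500)  # equivalent, set per call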
@alexanderholt
alexanderholt / removecharfromstring
Last active October 10, 2017 13:15
Removing characters using .split(), .lstrip(), and .rstrip()
test_strip = 'Braund, Mr. Owen Harris'
test_strip.split('.')[1].lstrip()
# .split('.') splits the string into a list of pieces around each '.',
# then [1] takes the second piece (' Owen Harris')
# .lstrip() removes leading whitespace by default, giving 'Owen Harris'
bad[i].split('(')[1].lstrip().rstrip(')')
# same idea: take the piece after '(' and strip the trailing ')'
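For illustration, a sketch of the second pattern applied over a whole list; `bad` and its contents are hypothetical stand-ins for the messy strings in the original notebook:

bad = ['Ristorante Roma (Georgetown)', 'Blue Duck Tavern (West End)']

# pull the text inside the parentheses for each entry
neighborhoods = [s.split('(')[1].lstrip().rstrip(')') for s in bad]
# ['Georgetown', 'West End']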
# use the re module for regular expressions
# the pattern can be generated and tested at https://regex101.com
import re

regex = r"\d+"  # one or more digits
for book in html.find_all('div', class_='booking'):
    matches = re.search(regex, book.text)  # regex101's generated code uses finditer; change it to search here
    # Joe says to wrap this in try/except every time
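Following the try/except advice in the comment, a minimal sketch of the same loop with the match guarded; `html` is assumed to be a BeautifulSoup object from elsewhere in the scraper:

import re

bookings = []
for book in html.find_all('div', class_='booking'):
    try:
        # grab the first run of digits, e.g. 'Booked 14 times today' -> 14
        bookings.append(int(re.search(r"\d+", book.text).group()))
    except AttributeError:
        # re.search returned None because no digits were found
        bookings.append(0)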
import pandas as pd

dc_eats = pd.DataFrame(columns=["name", "location", "price", "bookings"])

# loop through each entry
for entry in html.find_all('div', {'class': 'result content-section-list-row cf with-times'}):
    # grab the name
    name = entry.find('span', {'class': 'rest-row-name-text'}).text
    # grab the location
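The snippet breaks off after the name; below is a sketch of how the remaining fields might be collected into rows and then into the DataFrame. The location, price, and bookings selectors are hypothetical placeholders, not the real class names from the page:

import pandas as pd

rows = []
for entry in html.find_all('div', {'class': 'result content-section-list-row cf with-times'}):
    name = entry.find('span', {'class': 'rest-row-name-text'}).text
    # hypothetical selectors -- replace with the actual class names from the page
    location = entry.find('span', {'class': 'rest-row-location'}).text
    price = entry.find('div', {'class': 'rest-row-pricing'}).text
    bookings = entry.find('div', {'class': 'booking'}).text
    rows.append({"name": name, "location": location, "price": price, "bookings": bookings})

dc_eats = pd.DataFrame(rows, columns=["name", "location", "price", "bookings"])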
# clicking the 'next' button in selenium
link = driver.find_element_by_link_text('Next')
link.click()
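For context, a sketch of the surrounding Selenium setup and a simple pagination loop (Selenium 3 style API, matching the snippet); the URL and page count are made up:

from selenium import webdriver

driver = webdriver.Chrome()  # assumes chromedriver is on the PATH
driver.get('https://www.opentable.com/washington-dc-restaurant-listings')  # example URL

for _ in range(5):  # walk through a few result pages
    # ... scrape the current page here ...
    link = driver.find_element_by_link_text('Next')
    link.click()

driver.quit()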
@alexanderholt
alexanderholt / nltk_import.py
Last active October 25, 2017 14:47
Natural language toolkit Lemmatize
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import HashingVectorizer

string = 'some string of characters'

# instantiate the lemmatizer
lemmatizer = WordNetLemmatizer()

# Before we can lemmatize our spam string we need to tokenize it.
tokenizer = RegexpTokenizer(r'\w+')

# hash-vectorize the text and look at the ten largest feature weights
hvec = HashingVectorizer()
hvec.fit([spam])  # `spam` is a document defined elsewhere in the notebook
df = pd.DataFrame(hvec.transform([string]).todense())
df.transpose().sort_values(0, ascending=False).head(10).transpose()
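The gist imports the tokenizer and lemmatizer but never calls them; here is a short sketch of the tokenize-then-lemmatize step it sets up (the WordNet data must be downloaded once via nltk.download):

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

nltk.download('wordnet')  # one-time download of the WordNet data

tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

string = 'some string of characters'
tokens = tokenizer.tokenize(string)                 # ['some', 'string', 'of', 'characters']
lemmas = [lemmatizer.lemmatize(t) for t in tokens]  # ['some', 'string', 'of', 'character']
print(lemmas)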